From b8d77cdeab8b48d70fd83191debf63b1cabb2f25 Mon Sep 17 00:00:00 2001 From: Arhan Busam Date: Mon, 11 Nov 2024 08:36:25 +1100 Subject: [PATCH 001/106] readme: add llama3.2-vision to model list (#7580) --- README.md | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index a7d6cde2..41a4289f 100644 --- a/README.md +++ b/README.md @@ -47,26 +47,28 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam Here are some example models that can be downloaded: -| Model | Parameters | Size | Download | -| ------------------ | ---------- | ----- | ------------------------------ | -| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` | -| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` | -| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | -| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` | -| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | -| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | -| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | -| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` | -| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | -| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` | -| Mistral | 7B | 4.1GB | `ollama run mistral` | -| Moondream 2 | 1.4B | 829MB | `ollama run moondream` | -| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` | -| Starling | 7B | 4.1GB | `ollama run starling-lm` | -| Code Llama | 7B | 3.8GB | `ollama run codellama` | -| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` | -| LLaVA | 7B | 4.5GB | `ollama run llava` | -| Solar | 10.7B | 6.1GB | `ollama run solar` | +| Model | Parameters | Size | Download | +| ------------------ | ---------- | ----- | -------------------------------- | +| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` | +| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` | +| Llama 3.2 Vision | 11B | 7.9GB | `ollama run llama3.2-vision` | +| Llama 3.2 Vision | 90B | 55GB | `ollama run llama3.2-vision:90b` | +| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | +| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` | +| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | +| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` | +| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` | +| Gemma 2 | 2B | 1.6GB | `ollama run gemma2:2b` | +| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` | +| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` | +| Mistral | 7B | 4.1GB | `ollama run mistral` | +| Moondream 2 | 1.4B | 829MB | `ollama run moondream` | +| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` | +| Starling | 7B | 4.1GB | `ollama run starling-lm` | +| Code Llama | 7B | 3.8GB | `ollama run codellama` | +| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` | +| LLaVA | 7B | 4.5GB | `ollama run llava` | +| Solar | 10.7B | 6.1GB | `ollama run solar` | > [!NOTE] > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. 
From 76b2b723b2a365c7e9e66bf22492760e0bc4ff5a Mon Sep 17 00:00:00 2001 From: Evan Date: Sun, 10 Nov 2024 17:30:27 -0800 Subject: [PATCH 002/106] api: fix typo in python ClientFromEnvironment docs (#7604) --- api/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/client.go b/api/client.go index 2528fb21..4688d4d1 100644 --- a/api/client.go +++ b/api/client.go @@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error { // ClientFromEnvironment creates a new [Client] using configuration from the // environment variable OLLAMA_HOST, which points to the network host and -// port on which the ollama service is listenting. The format of this variable +// port on which the ollama service is listening. The format of this variable // is: // // ://: From 479d5517668a0e8b68be8aae8e2f940efcbfbb60 Mon Sep 17 00:00:00 2001 From: frances720 Date: Sun, 10 Nov 2024 19:04:23 -0800 Subject: [PATCH 003/106] docs: add mentions of Llama 3.2 (#7517) --- docs/import.md | 8 +++----- docs/modelfile.md | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/import.md b/docs/import.md index 2346886f..b90377bf 100644 --- a/docs/import.md +++ b/docs/import.md @@ -32,7 +32,7 @@ ollama run my-model Ollama supports importing adapters based on several different model architectures including: - * Llama (including Llama 2, Llama 3, and Llama 3.1); + * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); * Mistral (including Mistral 1, Mistral 2, and Mixtral); and * Gemma (including Gemma 1 and Gemma 2) @@ -67,14 +67,12 @@ ollama run my-model Ollama supports importing models for several different architectures including: - * Llama (including Llama 2, Llama 3, and Llama 3.1); + * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2); * Mistral (including Mistral 1, Mistral 2, and Mixtral); * Gemma (including Gemma 1 and Gemma 2); and * Phi3 -This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model. - - +This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model. ## Importing a GGUF based model or adapter If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by: diff --git a/docs/modelfile.md b/docs/modelfile.md index aa2849e7..c73f960a 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -120,7 +120,7 @@ FROM The model directory should contain the Safetensors weights for a supported architecture. Currently supported model architectures: - * Llama (including Llama 2, Llama 3, and Llama 3.1) + * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2) * Mistral (including Mistral 1, Mistral 2, and Mixtral) * Gemma (including Gemma 1 and Gemma 2) * Phi3 From 4e94227b5d564030cb8cad47c37e3ce74dc65e1b Mon Sep 17 00:00:00 2001 From: Ivo Stoykov Date: Mon, 11 Nov 2024 06:14:22 +0000 Subject: [PATCH 004/106] readme: add browser extension that enables using Ollama for interacting with web pages (#5827) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 41a4289f..d0f6e386 100644 --- a/README.md +++ b/README.md @@ -454,6 +454,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) +- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) From 36a8372b2884c40cc5b86f6f859b012dc8125b80 Mon Sep 17 00:00:00 2001 From: Prasad Bhalerao <67261499+prasad89@users.noreply.github.com> Date: Mon, 11 Nov 2024 12:08:18 +0530 Subject: [PATCH 005/106] readme: add GoLamify to community integrations (#7521) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d0f6e386..7f102082 100644 --- a/README.md +++ b/README.md @@ -417,6 +417,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) - [Ollama for Swift](https://github.com/mattt/ollama-swift) +- [GoLamify](https://github.com/prasad89/golamify) ### Mobile From d48c1c5a4414a742f6e23e7ea11fc74dd2566c99 Mon Sep 17 00:00:00 2001 From: Evan Date: Mon, 11 Nov 2024 16:21:58 -0800 Subject: [PATCH 006/106] api: fix typos in Go Doc comments (#7620) --- api/types.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/api/types.go b/api/types.go index d09ad06c..e5291a02 100644 --- a/api/types.go +++ b/api/types.go @@ -12,7 +12,7 @@ import ( "time" ) -// StatusError is an error with and HTTP status code. +// StatusError is an error with an HTTP status code and message. type StatusError struct { StatusCode int Status string @@ -57,7 +57,7 @@ type GenerateRequest struct { Template string `json:"template"` // Context is the context parameter returned from a previous call to - // Generate call. It can be used to keep a short conversational memory. + // [Client.Generate]. It can be used to keep a short conversational memory. Context []int `json:"context,omitempty"` // Stream specifies whether the response is streaming; it is true by default. @@ -90,14 +90,14 @@ type ChatRequest struct { // Messages is the messages of the chat - can be used to keep a chat memory. Messages []Message `json:"messages"` - // Stream enable streaming of returned response; true by default. + // Stream enables streaming of returned responses; true by default. Stream *bool `json:"stream,omitempty"` // Format is the format to return the response in (e.g. "json"). Format string `json:"format"` // KeepAlive controls how long the model will stay loaded into memory - // followin the request. + // following the request. 
KeepAlive *Duration `json:"keep_alive,omitempty"` // Tools is an optional list of tools the model has access to. @@ -203,8 +203,8 @@ type Metrics struct { EvalDuration time.Duration `json:"eval_duration,omitempty"` } -// Options specified in [GenerateRequest], if you add a new option here add it -// to the API docs also. +// Options specified in [GenerateRequest]. If you add a new option here, also +// add it to the API docs. type Options struct { Runner From bebef1e50dca40e6b46c6cbc70eb9b06b3ab4730 Mon Sep 17 00:00:00 2001 From: Joey Zheng Date: Tue, 12 Nov 2024 08:44:46 +0800 Subject: [PATCH 007/106] readme: add aichat terminal app to community integrations (#7418) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7f102082..1f5cf8fd 100644 --- a/README.md +++ b/README.md @@ -361,6 +361,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor +- [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. ### Apple Vision Pro - [Enchanted](https://github.com/AugustDev/enchanted) From 65973ceb6417c2e2796fa59bd3225bc7bd79b403 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 8 Nov 2024 11:10:56 -0800 Subject: [PATCH 008/106] runner.go: Make KV entry accounting more robust The structure of the accounting for KV cache shifting was carried over from the old runner but it now doesn't feel natural with the new runner. There are a number of invariants that should hold true but are difficult to reason about. There is at least one bug report that would imply that the invariants are not holding. This reduces the number of implicit assumptions and is more forgiving of unexpected situations. It also improves behavior around which input tokens are kept when truncation occurs. 
Bug #7545 --- llama/runner/cache.go | 47 ++++++++++++++++++++-------- llama/runner/runner.go | 69 +++++++++++++++--------------------------- 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/llama/runner/cache.go b/llama/runner/cache.go index 75c1d874..190ccdff 100644 --- a/llama/runner/cache.go +++ b/llama/runner/cache.go @@ -2,6 +2,7 @@ package main import ( "errors" + "fmt" "log/slog" "reflect" "time" @@ -22,7 +23,11 @@ type InputCache struct { lc *llama.Context } -func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache { +func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) { + if kvSize/numSlots < 1 { + return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots) + } + slots := make([]InputCacheSlot, numSlots) for i := range slots { @@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b slots: slots, multiUserCache: multiUserCache, lc: lc, - } + }, nil } // Locking: Operations on InputCacheSlot (including finding one @@ -58,7 +63,7 @@ type InputCacheSlot struct { lastUsed time.Time } -func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) { +func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) { var slot *InputCacheSlot var numPast int var err error @@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach slot, numPast, err = c.findBestCacheSlot(prompt) } if err != nil { - return nil, nil, 0, err + return nil, nil, err } if !cachePrompt { @@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach prompt = prompt[numPast:] slot.Inputs = slot.Inputs[:numPast] - return slot, prompt, numPast, nil + return slot, prompt, nil } func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) { @@ -194,14 +199,30 @@ func countCommonPrefix(a []input, b []input) int { return count } -func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) { - // TODO (jessegross): KV cache removal can fail for certain types of models - // server.cpp doesn't handle this, though we can be more graceful - c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard) - c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard) +// Frees up space in the KV cache by deleting the oldest half of history and shifting +// the newest half into that space (saving numKeep inputs at the beginning). +// +// Assumes that at least 1 entry can be freed up by shifting (i.e. 
numKeep < numCtx) +func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) { + targetFree := (c.numCtx - numKeep) / 2 + targetFree = max(targetFree, 1) - for i := numKeep + numDiscard; i < len(slot.Inputs); i++ { - slot.Inputs[i-numDiscard] = slot.Inputs[i] + currentFree := c.numCtx - len(slot.Inputs) + discard := targetFree - currentFree + + if discard <= 0 { + return } - slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard] + + slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs), + "keep", numKeep, "discard", discard) + + // TODO (jessegross): KV cache removal can fail for certain types of models + c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) + c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard) + + for i := numKeep + discard; i < len(slot.Inputs); i++ { + slot.Inputs[i-discard] = slot.Inputs[i] + } + slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard] } diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 0a37dee0..b680f060 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -34,9 +34,6 @@ type input struct { } type Sequence struct { - // number of inputs evaluated - numPast int - // batch index iBatch int @@ -112,21 +109,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen params.numKeep = len(inputs) } - if !params.embedding { - // Subtracting 4 ensures that at least 1 input can be discarded during shift - params.numKeep = min(params.numKeep, s.cache.numCtx-4) - params.numKeep += s.bosToken - } else { - // Embeddings are 1 shot - just truncate to the context window, without ever shifting - params.numKeep = min(params.numKeep, s.cache.numCtx) + if s.model.AddBOSToken() { + params.numKeep += 1 } - // truncate to fit in context window + // Ensure that at least 1 input can be discarded during shift + params.numKeep = min(params.numKeep, s.cache.numCtx-1) + if len(inputs) > s.cache.numCtx { - slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep) - newInputs := inputs[:params.numKeep] - newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...) - inputs = newInputs + slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx) } var sc *llama.SamplingContext @@ -231,9 +222,6 @@ type Server struct { // KV cache cache *InputCache - // does this model require a beginning of sequence token? 
- bosToken int - // next sequence for prompt processing to avoid starvation nextSeq int @@ -258,18 +246,6 @@ func (s *Server) allNil() bool { return true } -func (s *Server) shiftContext(seq *Sequence) { - numLeft := seq.numPast - seq.numKeep - numDiscard := numLeft / 2 - - slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast, - "numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard) - - s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast) - - seq.numPast -= numDiscard -} - func flushPending(seq *Sequence) bool { joined := strings.Join(seq.pendingResponses, "") seq.pendingResponses = []string{} @@ -374,12 +350,19 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) continue } - if seq.numPast+len(seq.inputs) > s.cache.numCtx { - s.shiftContext(seq) - } - var numInputsProcessed int + shifted := false + for i, input := range seq.inputs { + if len(seq.cache.Inputs)+1 > s.cache.numCtx { + if !shifted { + s.cache.ShiftCacheSlot(seq.cache, seq.numKeep) + shifted = true + } else { + break + } + } + embedding := input.embed != nil // If we don't currently have a batch, use one of the correct type and @@ -403,13 +386,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) } crossAttention = seq.crossAttention - batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id) - seq.numPast++ + batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id) + seq.cache.Inputs = append(seq.cache.Inputs, input) numInputsProcessed++ } if numInputsProcessed > 0 { - seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...) seq.inputs = seq.inputs[numInputsProcessed:] seq.iBatch = batch.NumTokens() - 1 } @@ -632,7 +614,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { s.mu.Lock() for i, sq := range s.seqs { if sq == nil { - seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) + seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) if err != nil { s.mu.Unlock() http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError) @@ -715,7 +697,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { s.mu.Lock() for i, sq := range s.seqs { if sq == nil { - seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) + seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) if err != nil { s.mu.Unlock() http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError) @@ -802,10 +784,6 @@ func (s *Server) loadModel( } } - if s.model.AddBOSToken() { - s.bosToken = 1 - } - if ppath != "" { var err error s.image, err = NewImageContext(s.lc, ppath) @@ -814,7 +792,10 @@ func (s *Server) loadModel( } } - s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache) + s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache) + if err != nil { + panic(err) + } s.status = ServerStatusReady s.ready.Done() From 6606e4243c481eb5bcb552c95fa50ee5aa594f3b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 12 Nov 2024 09:12:50 -0800 Subject: [PATCH 009/106] docs: Capture docker cgroup workaround (#7519) GPU support can break on some systems after a while. This captures a known workaround to solve the problem. 
--- docs/troubleshooting.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 0a89b87f..cbd73c7b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -97,6 +97,8 @@ On linux, AMD GPU access typically requires `video` and/or `render` group member When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. +If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration. + If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported From ac07160c8da87802a84c01598af5a39b4660b28e Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 12 Nov 2024 09:13:23 -0800 Subject: [PATCH 010/106] doc: capture numeric group requirement (#6941) Docker uses the container filesystem for name resolution, so we can't guide users to use the name of the host group. Instead they must specify the numeric ID. --- docs/troubleshooting.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index cbd73c7b..3400b4e8 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -95,7 +95,7 @@ If none of those resolve the problem, gather additional information and file an On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. -When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. For example, in the following output `crw-rw---- 1 0 44 226, 0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44` If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker. 
Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration. From df011054fab42766d36cf319421badc4e0e4048a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 12 Nov 2024 10:31:52 -0800 Subject: [PATCH 011/106] Jetpack support for Go server (#7217) This adds support for the Jetson JetPack variants into the Go runner --- Dockerfile | 72 ++++++++++++++++++++++++++++++++++++----- discover/amd_linux.go | 2 +- discover/amd_windows.go | 2 +- discover/gpu.go | 8 ++--- discover/types.go | 2 +- llama/llama.go | 6 ++-- llama/make/cuda.make | 2 +- llm/server.go | 4 +-- 8 files changed, 78 insertions(+), 20 deletions(-) diff --git a/Dockerfile b/Dockerfile index 16d1e4be..ca09325c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 +ARG JETPACK_6=r36.2.0 +ARG JETPACK_5=r35.4.1 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds # @@ -13,7 +15,7 @@ ARG ROCM_VERSION=6.1.2 # ### Then incremental builds will be much faster in this container # -# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama . +# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama . # FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64 ARG CMAKE_VERSION @@ -76,9 +78,9 @@ ARG CUDA_V12_ARCHITECTURES ARG OLLAMA_FAST_BUILD RUN --mount=type=cache,target=/root/.ccache \ if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \ - make -C llama -j $(expr $(nproc) / 2 ) ; \ + make -j $(expr $(nproc) / 2 ) ; \ else \ - make -C llama -j 5 ; \ + make -j 5 ; \ fi FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64 @@ -90,7 +92,46 @@ ARG CUDA_V11_ARCHITECTURES ARG CUDA_V12_ARCHITECTURES ARG OLLAMA_FAST_BUILD RUN --mount=type=cache,target=/root/.ccache \ - make -C llama -j 8 + make -j 5 + +# Jetsons need to be built in discrete stages +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64 +ARG GOLANG_VERSION +RUN apt-get update && apt-get install -y git curl ccache && \ + curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \ + ln -s /usr/local/go/bin/go /usr/local/bin/go && \ + ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +WORKDIR /go/src/github.com/ollama/ollama/ +COPY . . +ARG CGO_CFLAGS +ENV GOARCH arm64 +RUN --mount=type=cache,target=/root/.ccache \ + make -j 5 cuda_v11 \ + CUDA_ARCHITECTURES="72;87" \ + GPU_RUNNER_VARIANT=_jetpack5 \ + CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \ + DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \ + DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5 + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64 +ARG GOLANG_VERSION +RUN apt-get update && apt-get install -y git curl ccache && \ + curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \ + ln -s /usr/local/go/bin/go /usr/local/bin/go && \ + ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +WORKDIR /go/src/github.com/ollama/ollama/ +COPY . . 
+ARG CGO_CFLAGS +ENV GOARCH arm64 +RUN --mount=type=cache,target=/root/.ccache \ + make -j 5 cuda_v12 \ + CUDA_ARCHITECTURES="87" \ + GPU_RUNNER_VARIANT=_jetpack6 \ + CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \ + DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \ + DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6 # Intermediate stages used for ./scripts/build_linux.sh @@ -134,12 +175,20 @@ FROM --platform=linux/arm64 builder-arm64 AS build-arm64 COPY . . COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/ +COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/ +COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ go build -trimpath -o dist/linux-arm64/bin/ollama . RUN cd dist/linux-$GOARCH && \ tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz +RUN cd dist/linux-$GOARCH-jetpack5 && \ + tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz +RUN cd dist/linux-$GOARCH-jetpack6 && \ + tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz FROM --platform=linux/amd64 scratch AS dist-amd64 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / @@ -180,16 +229,23 @@ RUN rm -rf \ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64 RUN apt-get update && \ apt-get install -y ca-certificates && \ - rm -rf /var/lib/apt/lists/* + apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/ +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/ RUN apt-get update && \ apt-get install -y ca-certificates && \ - rm -rf /var/lib/apt/lists/* + apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ -COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ + # ROCm libraries larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm @@ -198,7 +254,7 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/ RUN apt-get update && \ apt-get install -y ca-certificates && \ - rm -rf /var/lib/apt/lists/* + apt-get clean && rm -rf 
/var/lib/apt/lists/* COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ diff --git a/discover/amd_linux.go b/discover/amd_linux.go index fad7b7a6..d092f6b5 100644 --- a/discover/amd_linux.go +++ b/discover/amd_linux.go @@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { return nil, err } } - gpuInfo.DependencyPath = libDir + gpuInfo.DependencyPath = []string{libDir} if gfxOverride == "" { // Only load supported list once diff --git a/discover/amd_windows.go b/discover/amd_windows.go index b0c76f1e..efa5cc23 100644 --- a/discover/amd_windows.go +++ b/discover/amd_windows.go @@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { UnreliableFreeMemory: true, ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices - DependencyPath: libDir, + DependencyPath: []string{libDir}, MinimumMemory: rocmMinimumMemory, Name: name, Compute: gfx, diff --git a/discover/gpu.go b/discover/gpu.go index 808c807b..cf34b904 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList { Library: "cpu", Variant: cpuCapability.String(), ID: "0", - DependencyPath: depPath, + DependencyPath: []string{depPath}, }, CPUs: details, }, @@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList { gpuInfo.DriverMinor = driverMinor variant := cudaVariant(gpuInfo) if depPath != "" { - gpuInfo.DependencyPath = depPath + gpuInfo.DependencyPath = []string{depPath} // Check for variant specific directory if variant != "" { if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { - gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) + gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath} } } } @@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DependencyPath = depPath + gpuInfo.DependencyPath = []string{depPath} oneapiGPUs = append(oneapiGPUs, gpuInfo) } } diff --git a/discover/types.go b/discover/types.go index b2cca109..19f21524 100644 --- a/discover/types.go +++ b/discover/types.go @@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? 
MinimumMemory uint64 `json:"-"` // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly - DependencyPath string `json:"lib_path,omitempty"` + DependencyPath []string `json:"lib_path,omitempty"` // Extra environment variables specific to the GPU as list of [key,value] EnvWorkarounds [][2]string `json:"envs,omitempty"` diff --git a/llama/llama.go b/llama/llama.go index a092ea12..dbb02768 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -21,6 +21,8 @@ package llama #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 +#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64 +#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64 #cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64 #cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers @@ -36,8 +38,8 @@ package llama #cgo linux CXXFLAGS: -D_GNU_SOURCE #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64 -#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8 -#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8 +#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA +#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve diff --git a/llama/make/cuda.make b/llama/make/cuda.make index 7ff1815f..7a4b1036 100644 --- a/llama/make/cuda.make +++ b/llama/make/cuda.make @@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) -GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS)))) +GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) ifeq ($(OS),linux) CUDA_PATH?=/usr/local/cuda diff --git a/llm/server.go b/llm/server.go index 5ca6aa32..96815826 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,9 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter // Note: we always put the dependency path first // since this was the exact version we compiled/linked against - if gpus[0].DependencyPath != "" { + if gpus[0].DependencyPath != nil { // assume gpus from the same library have the same dependency path - libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) + libraryPaths = append(gpus[0].DependencyPath, libraryPaths...) 
} server := filepath.Join(dir, "ollama_llama_server") From 636a743c2bc5c2a1ed21266b76645cbb117c3ffa Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 12 Nov 2024 11:22:39 -0800 Subject: [PATCH 012/106] CI: give windows lint more time (#7635) It looks like 8 minutes isn't quite enough and we're seeing sporadic timeouts --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5058ca9e..3b50e723 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -281,7 +281,7 @@ jobs: shell: bash - uses: golangci/golangci-lint-action@v6 with: - args: --timeout 8m0s -v + args: --timeout 10m0s -v test: strategy: matrix: From d7eb05b9361febead29a74e71ddffc2ebeff5302 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 12 Nov 2024 10:41:44 -0800 Subject: [PATCH 013/106] runner.go: Fix off-by-one for num predicted --- llama/runner/runner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index b680f060..cff7d148 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -345,7 +345,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) } // if past the num predict limit - if seq.numPredict > 0 && seq.numPredicted > seq.numPredict { + if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict { s.removeSequence(seqIdx, "limit") continue } From 5b3393b6a2920c4f410ee636777533c77752106e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 13 Nov 2024 14:12:30 -0800 Subject: [PATCH 014/106] fix(mllama): sync backend between batches --- llama/llama.go | 4 ++++ llama/runner/runner.go | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/llama/llama.go b/llama/llama.go index a092ea12..df06f0f6 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -598,6 +598,10 @@ func (c *Context) SetCrossAttention(state bool) { C.llama_set_cross_attention(c.c, C.bool(state)) } +func (c *Context) Synchronize() { + C.llama_synchronize(c.c) +} + // sampling // TODO: this is a temporary wrapper to allow calling C++ code from CGo type SamplingContext struct { diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 0a37dee0..637dd9cc 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -427,6 +427,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) return } + if crossAttention { + // synchronize state to ensure the cross attention batch is complete. 
+ // needed specifically for multi-GPU systems otherwise an inflight + // task may be incorrectly invalidated causing a crash + s.lc.Synchronize() + } + for i, seq := range s.seqs { if seq == nil { continue From 67691e410db7a50b07a64858820b14de9aa91314 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Wed, 13 Nov 2024 23:53:30 -0800 Subject: [PATCH 015/106] cmd: preserve exact bytes when displaying template/system layers (#7586) --- cmd/cmd.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index b8c9c640..91819c8e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -800,9 +800,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error { case "parameters": fmt.Println(resp.Parameters) case "system": - fmt.Println(resp.System) + fmt.Print(resp.System) case "template": - fmt.Println(resp.Template) + fmt.Print(resp.Template) } return nil From 17b386a891af182650f93d528ff78f2fded9efc6 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 12 Nov 2024 11:23:46 -0800 Subject: [PATCH 016/106] runner.go: Enforce NUM_PARALLEL directly in the runner NUM_PARALEL is currently enforced by the Ollama server process - it will only issue requests to the runner if the maximum number of concurrent requests has not been exceeded. Although this should be sufficient, it is good for the runner to protect its own data structures. Currently, if too many requests get through to the runner, they will just get stuck and never return. This may help with reports of Ollama hanging, though it is unclear how it would actually occur. Bug #7573 --- llama/runner/runner.go | 69 +++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index e65bd637..c034bc46 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -20,6 +20,8 @@ import ( "time" "unicode/utf8" + "golang.org/x/sync/semaphore" + "github.com/ollama/ollama/api" "github.com/ollama/ollama/llama" ) @@ -203,38 +205,51 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) { } type Server struct { - model *llama.Model - lc *llama.Context + // is the server ready to process requests? + // protects access to model and image + ready sync.WaitGroup - // required for image embeddings + // loaded model + model *llama.Model + + // image model context for multi-modal models image *ImageContext + // status for external health reporting - loading, ready to serve, etc. + status ServerStatus + + // current progress on loading the model + progress float32 + + // number of simultaneous requests to handle + parallel int + + // maximum number of elements in a batch (per sequence) // TODO (jmorganca): make this n_batch batchSize int - // parallel is the number of parallel requests to handle - parallel int + // protects access to everything below this line + // this is context state needed for decoding + mu sync.Mutex - // seqs is the list of parallel sequences being evaluated - // TODO (jmorganca): this can probably be moved into run() + // indicates that data is ready for processing + cond *sync.Cond + + // decoding state + lc *llama.Context + + // the list of simultaneous sequences being evaluated seqs []*Sequence + // seqs can have a maximum of parallel entries, which + // is enfoced by seqSem + seqsSem *semaphore.Weighted + // KV cache cache *InputCache // next sequence for prompt processing to avoid starvation nextSeq int - - // is the server ready to process requests? 
- ready sync.WaitGroup - - mu sync.Mutex - - cond *sync.Cond - - progress float32 - - status ServerStatus } func (s *Server) allNil() bool { @@ -616,8 +631,13 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - // TODO (jmorganca): add to sequence queue instead of - // failing if a slot isn't available + // Ensure that a place to put the sequence is available + if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { + slog.Error("Failed to acquire semaphore", "error", err) + return + } + defer s.seqsSem.Release(1) + s.mu.Lock() for i, sq := range s.seqs { if sq == nil { @@ -700,7 +720,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { return } - // TODO (jessegross): Wait for a free slot instead of failing and blocking forever + // Ensure that a place to put the sequence is available + if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { + slog.Error("Failed to acquire semaphore", "error", err) + return + } + defer s.seqsSem.Release(1) + s.mu.Lock() for i, sq := range s.seqs { if sq == nil { @@ -855,6 +881,7 @@ func main() { batchSize: *batchSize, parallel: *parallel, seqs: make([]*Sequence, *parallel), + seqsSem: semaphore.NewWeighted(int64(*parallel)), status: ServerStatusLoadingModel, } From c25ffde91d3d2f8913224ac9bbc28736a4981fa3 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 13 Nov 2024 16:49:01 -0800 Subject: [PATCH 017/106] runner.go: Don't trim whitespace from inputs It's possible to get prompts that consist entirely of whitespace - this is most likely to happen when generating embeddings. Currently, we will trim this away, leaving an empty prompt, which will then generate an error. Generating embeddings from whitespace should not trigger an error, as this may break pipelines. It's better to just leave the whitespace in place and process what we are given. This is consistent with past versions of Ollama. 
Bug #7578 --- llama/runner/runner.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index c034bc46..0ae50608 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -163,15 +163,13 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) { for i, part := range parts { // text - tokenize - if strings.TrimSpace(part) != "" { - tokens, err := s.lc.Model().Tokenize(part, i == 0, true) - if err != nil { - return nil, err - } + tokens, err := s.lc.Model().Tokenize(part, i == 0, true) + if err != nil { + return nil, err + } - for _, t := range tokens { - inputs = append(inputs, input{token: t}) - } + for _, t := range tokens { + inputs = append(inputs, input{token: t}) } // image - generate image embedding From 0679d491fe7d23ea220a701e3362cf529ec2a599 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 14 Nov 2024 13:58:25 -0800 Subject: [PATCH 018/106] chore(deps): bump golang.org/x dependencies (#7655) - golang.org/x/sync v0.3.0 -> v0.9.0 - golang.org/x/image v0.14.0 -> v0.22.0 - golang.org/x/text v0.15.0 -> v0.20.0 --- go.mod | 6 +++--- go.sum | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index b2072069..8102c6bc 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.9.0 github.com/x448/float16 v0.8.4 - golang.org/x/sync v0.3.0 + golang.org/x/sync v0.9.0 ) require ( @@ -22,7 +22,7 @@ require ( github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c - golang.org/x/image v0.14.0 + golang.org/x/image v0.22.0 ) require ( @@ -73,7 +73,7 @@ require ( golang.org/x/net v0.25.0 // indirect golang.org/x/sys v0.20.0 golang.org/x/term v0.20.0 - golang.org/x/text v0.15.0 + golang.org/x/text v0.20.0 google.golang.org/protobuf v1.34.1 gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index d4d1c9a9..e98adeaa 100644 --- a/go.sum +++ b/go.sum @@ -232,6 +232,8 @@ golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+o golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4= golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE= +golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g= +golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -267,6 +269,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= +golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -293,6 +297,8 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 4efb98cb4fcd713cf4d272154f0e7201901193d1 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Thu, 14 Nov 2024 13:59:44 -0800 Subject: [PATCH 019/106] add line numbers for parser errors (#7326) --- parser/parser.go | 32 +++++++++++++++++++++++++++--- parser/parser_test.go | 45 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 7f566da4..cc78d1aa 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -65,9 +65,22 @@ var ( errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"") ) +type ParserError struct { + LineNumber int + Msg string +} + +func (e *ParserError) Error() string { + if e.LineNumber > 0 { + return fmt.Sprintf("(line %d): %s", e.LineNumber, e.Msg) + } + return e.Msg +} + func ParseFile(r io.Reader) (*File, error) { var cmd Command var curr state + var currLine int = 1 var b bytes.Buffer var role string @@ -84,11 +97,18 @@ func ParseFile(r io.Reader) (*File, error) { return nil, err } + if isNewline(r) { + currLine++ + } + next, r, err := parseRuneForState(r, curr) if errors.Is(err, io.ErrUnexpectedEOF) { return nil, fmt.Errorf("%w: %s", err, b.String()) } else if err != nil { - return nil, err + return nil, &ParserError{ + LineNumber: currLine, + Msg: err.Error(), + } } // process the state transition, some transitions need to be intercepted and redirected @@ -96,7 +116,10 @@ func ParseFile(r io.Reader) (*File, error) { switch curr { case stateName: if !isValidCommand(b.String()) { - return nil, errInvalidCommand + return nil, &ParserError{ + LineNumber: currLine, + Msg: errInvalidCommand.Error(), + } } // next state sometimes depends on the current buffer value @@ -117,7 +140,10 @@ func ParseFile(r io.Reader) (*File, error) { cmd.Name = b.String() case stateMessage: if !isValidMessageRole(b.String()) { - return nil, errInvalidMessageRole + return nil, &ParserError{ + LineNumber: currLine, + Msg: errInvalidMessageRole.Error(), + } } role = b.String() diff --git a/parser/parser_test.go b/parser/parser_test.go index 6a4d853f..deadafd0 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -3,6 +3,7 @@ package parser import ( "bytes" "encoding/binary" + "errors" "fmt" "io" "strings" @@ -180,8 +181,15 @@ func TestParseFileBadCommand(t *testing.T) { FROM foo BADCOMMAND param1 value1 ` + parserError := &ParserError{ + 
LineNumber: 3, + Msg: errInvalidCommand.Error(), + } + _, err := ParseFile(strings.NewReader(input)) - require.ErrorIs(t, err, errInvalidCommand) + if !errors.As(err, &parserError) { + t.Errorf("unexpected error: expected: %s, actual: %s", parserError.Error(), err.Error()) + } } func TestParseFileMessages(t *testing.T) { @@ -245,7 +253,10 @@ FROM foo MESSAGE badguy I'm a bad guy! `, nil, - errInvalidMessageRole, + &ParserError{ + LineNumber: 3, + Msg: errInvalidMessageRole.Error(), + }, }, { ` @@ -264,13 +275,35 @@ MESSAGE system`, }, } - for _, c := range cases { + for _, tt := range cases { t.Run("", func(t *testing.T) { - modelfile, err := ParseFile(strings.NewReader(c.input)) - require.ErrorIs(t, err, c.err) + modelfile, err := ParseFile(strings.NewReader(tt.input)) + if modelfile != nil { - assert.Equal(t, c.expected, modelfile.Commands) + assert.Equal(t, tt.expected, modelfile.Commands) } + + if tt.err == nil { + if err != nil { + t.Fatalf("expected no error, but got %v", err) + } + return + } + + switch tt.err.(type) { + case *ParserError: + var pErr *ParserError + if errors.As(err, &pErr) { + // got the correct type of error + return + } + } + + if errors.Is(err, tt.err) { + return + } + + t.Fatalf("unexpected error: expected: %v, actual: %v", tt.err, err) }) } } From a0ea067b63ad61016a44c1c7a86bffbfa678035a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 14 Nov 2024 16:02:01 -0800 Subject: [PATCH 020/106] build: fix arm container image (#7674) Fix a rebase glitch from the old C++ runner build model --- Dockerfile | 10 +++------- docs/docker.md | 3 +++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index ca09325c..baf259d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -234,17 +234,13 @@ COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-am COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/ -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/ RUN apt-get update && \ apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ -COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/ +COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/ # ROCm libraries larger so we keep it distinct from the CPU/CUDA image diff --git a/docs/docker.md b/docs/docker.md index 9c758c38..9dd387e3 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -50,6 +50,9 @@ sudo systemctl restart docker docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama ``` +> [!NOTE] +> If you're running 
on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6. + ### AMD GPU To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command: From 8a35bb926e63cd36e221eafc4dd4054fbdcd398b Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 14 Nov 2024 15:01:48 -0800 Subject: [PATCH 021/106] runner.go: Increase survivability of main processing loop Currently, if an error occurs during the prep stages (such as tokenizing) of a single request, it will only affect that request. However, if an error happens during decoding, it can take down the entire runner. Instead, it's better to drop the tokens that triggered the error and try to keep going. However, we also need to stop when we run out of tokens, otherwise, this just causes an infinite loop. This is likely the cause of at least some of the hanging issues that have been reported. Bug #7573 --- llama/runner/runner.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 0ae50608..3ffb57bb 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -14,6 +14,7 @@ import ( "path/filepath" "regexp" "runtime" + "runtime/debug" "strconv" "strings" "sync" @@ -339,6 +340,15 @@ func (s *Server) run(ctx context.Context) { // it should only be responsible for accepting tokens or embeddings and // processing batches as fast as possible func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) { + // Try to keep going even if we hit a panic so that corner cases don't take the whole + // runner down. In most cases, this will result in dropping the tokens that we are currently + // processing and then continuing with what is remaining. + defer func() { + if err := recover(); err != nil { + slog.Error("error while processing batch", "error", err, "stack", debug.Stack()) + } + }() + s.mu.Lock() for s.allNil() { s.cond.Wait() // Wait until an item is added @@ -357,6 +367,14 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) continue } + // If an error occurred during the processing of a previous batch then we may have emptied the inputs + // without adding a new one. In this case, end the sequence rather than infinite looping. + if len(seq.inputs) == 0 { + slog.Error("removing sequence due to no input tokens", "index", seqIdx, "cache id", seq.cache.Id) + s.removeSequence(seqIdx, "error") + continue + } + // if past the num predict limit if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict { s.removeSequence(seqIdx, "limit") From d875e99e4639dc07af90b2e3ea0d175e2e692efb Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 15 Nov 2024 11:34:30 -0800 Subject: [PATCH 022/106] runner.go: Propagate panics back to the user. This is a partial revert of 8a35bb92 "runner.go: Increase survivability of main processing loop", removing the panic handler. Although we want to avoid errors taking down the runner, we also should make the user aware of problems when they happen. In the future, we can restructure things so both parts are true. 
--- llama/runner/runner.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 3ffb57bb..a2da546f 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -14,7 +14,6 @@ import ( "path/filepath" "regexp" "runtime" - "runtime/debug" "strconv" "strings" "sync" @@ -340,15 +339,6 @@ func (s *Server) run(ctx context.Context) { // it should only be responsible for accepting tokens or embeddings and // processing batches as fast as possible func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) { - // Try to keep going even if we hit a panic so that corner cases don't take the whole - // runner down. In most cases, this will result in dropping the tokens that we are currently - // processing and then continuing with what is remaining. - defer func() { - if err := recover(); err != nil { - slog.Error("error while processing batch", "error", err, "stack", debug.Stack()) - } - }() - s.mu.Lock() for s.allNil() { s.cond.Wait() // Wait until an item is added From 4759d879f2376ffb9b82f296e442ec8ef137f27b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 15 Nov 2024 16:47:54 -0800 Subject: [PATCH 023/106] Install support for jetpacks (#7632) Follow up to #7217 - merge after release --- scripts/install.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/install.sh b/scripts/install.sh index 79a7b564..ec58ddbd 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -93,6 +93,22 @@ else fi fi +# Check for NVIDIA JetPack systems with additional downloads +if [ -f /etc/nv_tegra_release ] ; then + if grep R36 /etc/nv_tegra_release > /dev/null ; then + status "Downloading JetPack 6 components" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-jetpack6.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + elif grep R35 /etc/nv_tegra_release > /dev/null ; then + status "Downloading JetPack 5 components" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-jetpack5.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + else + warning "Unsupported JetPack version detected. GPU may not be supported" + fi +fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -163,6 +179,13 @@ if [ "$IS_WSL2" = true ]; then exit 0 fi +# Don't attempt to install drivers on Jetson systems +if [ -f /etc/nv_tegra_release ] ; then + status "NVIDIA JetPack ready." + install_success + exit 0 +fi + # Install GPU dependencies on Linux if ! available lspci && ! available lshw; then warning "Unable to detect NVIDIA/AMD GPU. Install lspci or lshw to automatically detect and install GPU dependencies." From b42a596425037148286281a1942dbff0bc9733f5 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 17 Nov 2024 11:48:12 -0800 Subject: [PATCH 024/106] docs: add customization section in linux.md (#7709) --- docs/linux.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/linux.md b/docs/linux.md index 0eec014f..8204ece5 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -112,6 +112,21 @@ sudo systemctl status ollama > https://www.amd.com/en/support/linux-drivers for best support of your Radeon > GPU. 
+## Customizing + +To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running: + +``` +sudo systemctl edit ollama +``` + +Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`: + +```ini +[Service] +Environment="OLLAMA_DEBUG=1" +``` + ## Updating Update Ollama by running the install script again: @@ -129,7 +144,7 @@ sudo tar -C /usr -xzf ollama-linux-amd64.tgz ## Installing specific versions -Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). +Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). For example: From 8b4b243f5fd31000515548e52bf66bcdb72f70e5 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 17 Nov 2024 13:01:04 -0800 Subject: [PATCH 025/106] server: fix warnings in prompt_test.go (#7710) --- server/prompt_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/prompt_test.go b/server/prompt_test.go index 6d04db53..21a5fd62 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -32,7 +32,7 @@ func TestChatPrompt(t *testing.T) { mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} createImg := func(width, height int) ([]byte, error) { - img := image.NewRGBA(image.Rect(0, 0, 5, 5)) + img := image.NewRGBA(image.Rect(0, 0, width, height)) var buf bytes.Buffer if err := png.Encode(&buf, img); err != nil { From 1c041171141f76b64669f990ca1ff228ce2968b6 Mon Sep 17 00:00:00 2001 From: Vinh Nguyen <1097578+vinhnx@users.noreply.github.com> Date: Mon, 18 Nov 2024 05:35:41 +0700 Subject: [PATCH 026/106] readme: add the VT app to the community integrations section (#7706) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1f5cf8fd..0bdab5fd 100644 --- a/README.md +++ b/README.md @@ -335,6 +335,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation) +- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app with dynamic conversation routing, support both models backend by Ollama) ### Terminal From d5da2ab7e82a04ec72e62b830a3f59c6ca601be6 Mon Sep 17 00:00:00 2001 From: Tushar Adhatrao <40828350+tusharad@users.noreply.github.com> Date: Mon, 18 Nov 2024 04:48:04 +0530 Subject: [PATCH 027/106] readme: add ollama-haskell library to community integrations (#7451) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0bdab5fd..49938b37 100644 --- a/README.md +++ b/README.md @@ -420,6 +420,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) - [Ollama for Swift](https://github.com/mattt/ollama-swift) - [GoLamify](https://github.com/prasad89/golamify) +- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell) ### Mobile From c9a5aca3daf3bd8e704f668c6995926b56a7e65f Mon Sep 17 00:00:00 2001 From: Darius Kocar <60488234+DariusKocar@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:19:26 -0800 Subject: [PATCH 028/106] readme: add Perfect Memory AI to community integrations (#7431) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 49938b37..9083b3b2 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) +- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app with dynamic conversation routing, support both models backend by Ollama) From 760cfa27e503cd56ee61207c5ac9dfd66761ff44 Mon Sep 17 00:00:00 2001 From: Nicolas Bonamy Date: Sun, 17 Nov 2024 18:33:10 -0600 Subject: [PATCH 029/106] readme: add Witsy and multi-llm-ts to community integrations (#7713) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9083b3b2..9623af65 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app with dynamic conversation routing, support both models backend by Ollama) +- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) ### Terminal @@ -422,6 +423,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollama for Swift](https://github.com/mattt/ollama-swift) - [GoLamify](https://github.com/prasad89/golamify) - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell) +- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API) ### Mobile From a14f76491d694b2f5a0dec6473514b7f93beeea0 Mon Sep 17 00:00:00 2001 From: Vinh Nguyen <1097578+vinhnx@users.noreply.github.com> Date: Mon, 18 Nov 2024 10:30:22 +0700 Subject: [PATCH 030/106] readme: improve Community Integrations section (#7718) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9623af65..62ba4aba 100644 --- a/README.md +++ b/README.md @@ -335,8 +335,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) -- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation) -- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app with dynamic conversation routing, support both models backend by Ollama) +- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) +- [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama) - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) ### Terminal From 81d55d3e4d3e18404414900dd341438aad329656 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 18 Nov 2024 11:48:13 -0800 Subject: [PATCH 031/106] fix index out of range on zero layer metal load (#7696) If the model doesn't fit any layers on metal, and we load zero layers we would panic trying to look up the GPU size during scheduling ops --- llm/server.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 96815826..624acbf8 100644 --- a/llm/server.go +++ b/llm/server.go @@ -1092,7 +1092,9 @@ func (s *llmServer) EstimatedTotal() uint64 { func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 { for i, gpu := range s.gpus { if gpu.ID == gpuID { - return s.estimate.GPUSizes[i] + if i < len(s.estimate.GPUSizes) { + return s.estimate.GPUSizes[i] + } } } return 0 From 35096a7eff0bc19e50def69f75138b55244d31c5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 18 Nov 2024 14:39:52 -0800 Subject: [PATCH 032/106] win: add right click menu support (#7727) Enable both left and right click on the pop-up menu --- app/tray/wintray/tray.go | 2 +- app/tray/wintray/w32api.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go index 6f827893..19fa98e9 100644 --- a/app/tray/wintray/tray.go +++ b/app/tray/wintray/tray.go @@ -361,7 +361,7 @@ func (t *winTray) showMenu() error { boolRet, _, err = pTrackPopupMenu.Call( uintptr(t.menus[0]), - TPM_BOTTOMALIGN|TPM_LEFTALIGN, + TPM_BOTTOMALIGN|TPM_LEFTALIGN|TPM_RIGHTBUTTON, uintptr(p.X), uintptr(p.Y), 0, diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go index 7c7c0ac8..d23bfd97 100644 --- 
a/app/tray/wintray/w32api.go +++ b/app/tray/wintray/w32api.go @@ -67,6 +67,7 @@ const ( SW_HIDE = 0 TPM_BOTTOMALIGN = 0x0020 TPM_LEFTALIGN = 0x0000 + TPM_RIGHTBUTTON = 0x0002 WM_CLOSE = 0x0010 WM_USER = 0x0400 WS_CAPTION = 0x00C00000 From 5c18e66384de7f8106fc3b26bfafe0145ed5f7a9 Mon Sep 17 00:00:00 2001 From: frob Date: Tue, 19 Nov 2024 00:02:41 +0100 Subject: [PATCH 033/106] Notify the user if systemd is not running (#6693) Co-authored-by: Richard Lyons --- scripts/install.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index ec58ddbd..850800a0 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -4,9 +4,12 @@ set -eu +red="$( (/usr/bin/tput bold; /usr/bin/tput setaf 1; :) 2>&-)" +plain="$( (/usr/bin/tput sgr0; :) 2>&-)" + status() { echo ">>> $*" >&2; } -error() { echo "ERROR $*"; exit 1; } -warning() { echo "WARNING: $*"; } +error() { echo "${red}ERROR:${plain} $*"; exit 1; } +warning() { echo "${red}WARNING:${plain} $*"; } TEMP_DIR=$(mktemp -d) cleanup() { rm -rf $TEMP_DIR; } @@ -162,6 +165,12 @@ EOF start_service() { $SUDO systemctl restart ollama; } trap start_service EXIT ;; + *) + warning "systemd is not running" + if [ "$IS_WSL2" = true ]; then + warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it" + fi + ;; esac } From 6cdf27d154e7df12d6b39cc059364a37f78679a2 Mon Sep 17 00:00:00 2001 From: Patrick Sy Date: Tue, 19 Nov 2024 04:33:23 +0100 Subject: [PATCH 034/106] readme: add Alfred Ollama to community integrations (#7724) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 62ba4aba..e51be1e4 100644 --- a/README.md +++ b/README.md @@ -467,6 +467,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator) - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama) +- [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow) ### Supported backends From 712d63c3f06f297e22b1ae32678349187dccd2e4 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Mon, 18 Nov 2024 21:17:38 -0800 Subject: [PATCH 035/106] update the docs (#7731) --- docs/api.md | 74 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/docs/api.md b/docs/api.md index 2836d73f..9fd87590 100644 --- a/docs/api.md +++ b/docs/api.md @@ -830,10 +830,30 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m ### Parameters -- `name`: name of the model to create +- `model`: name of the model to create - `modelfile` (optional): contents of the Modelfile - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects - `path` (optional): path to the Modelfile +- `quantize` (optional): quantize a non-quantized (e.g. float16) model + +#### Quantization types + +| Type | Recommended | +| --- | :-: | +| q2_K | | +| q3_K_L | | +| q3_K_M | | +| q3_K_S | | +| q4_0 | | +| q4_1 | | +| q4_K_M | * | +| q4_K_S | | +| q5_0 | | +| q5_1 | | +| q5_K_M | | +| q5_K_S | | +| q6_K | | +| q8_0 | * | ### Examples @@ -845,14 +865,14 @@ Create a new model from a `Modelfile`. 
```shell curl http://localhost:11434/api/create -d '{ - "name": "mario", + "model": "mario", "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros." }' ``` ##### Response -A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`. +A stream of JSON objects is returned: ```json {"status":"reading model metadata"} @@ -868,13 +888,43 @@ A stream of JSON objects. Notice that the final JSON object shows a `"status": " {"status":"success"} ``` +#### Quantize a model + +Quantize a non-quantized model. + +##### Request + +```shell +curl http://localhost:11434/api/create -d '{ + "model": "llama3.1:quantized", + "modelfile": "FROM llama3.1:8b-instruct-fp16", + "quantize": "q4_K_M" +}' +``` + +##### Response + +A stream of JSON objects is returned: + +``` +{"status":"quantizing F16 model to Q4_K_M"} +{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"} +{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"} +{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"} +{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"} +{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"} +{"status":"writing manifest"} +{"status":"success"} +``` + + ### Check if a Blob Exists ```shell HEAD /api/blobs/:digest ``` -Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai. +Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com. #### Query Parameters @@ -979,7 +1029,7 @@ Show information about a model including details, modelfile, template, parameter ### Parameters -- `name`: name of the model to show +- `model`: name of the model to show - `verbose`: (optional) if set to `true`, returns full data for verbose response fields ### Examples @@ -988,7 +1038,7 @@ Show information about a model including details, modelfile, template, parameter ```shell curl http://localhost:11434/api/show -d '{ - "name": "llama3.2" + "model": "llama3.2" }' ``` @@ -1068,7 +1118,7 @@ Delete a model and its data. ### Parameters -- `name`: model name to delete +- `model`: model name to delete ### Examples @@ -1076,7 +1126,7 @@ Delete a model and its data. ```shell curl -X DELETE http://localhost:11434/api/delete -d '{ - "name": "llama3:13b" + "model": "llama3:13b" }' ``` @@ -1094,7 +1144,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where ### Parameters -- `name`: name of the model to pull +- `model`: name of the model to pull - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development. - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects @@ -1104,7 +1154,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where ```shell curl http://localhost:11434/api/pull -d '{ - "name": "llama3.2" + "model": "llama3.2" }' ``` @@ -1166,7 +1216,7 @@ Upload a model to a model library. 
Requires registering for ollama.ai and adding ### Parameters -- `name`: name of the model to push in the form of `/:` +- `model`: name of the model to push in the form of `/:` - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development. - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects @@ -1176,7 +1226,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding ```shell curl http://localhost:11434/api/push -d '{ - "name": "mattw/pygmalion:latest" + "model": "mattw/pygmalion:latest" }' ``` From e66c29261a8b8db6214ddebdc727e7b247be74df Mon Sep 17 00:00:00 2001 From: frob Date: Tue, 19 Nov 2024 17:33:52 +0100 Subject: [PATCH 036/106] Better error suppresion when getting terminal colours (#7739) Co-authored-by: Richard Lyons --- scripts/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index 850800a0..bc7b5f58 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -4,8 +4,8 @@ set -eu -red="$( (/usr/bin/tput bold; /usr/bin/tput setaf 1; :) 2>&-)" -plain="$( (/usr/bin/tput sgr0; :) 2>&-)" +red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)" +plain="$( (/usr/bin/tput sgr0 || :) 2>&-)" status() { echo ">>> $*" >&2; } error() { echo "${red}ERROR:${plain} $*"; exit 1; } From 4b8a2e341a9b4e713180b483f42316665c5faea3 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Tue, 19 Nov 2024 15:05:57 -0800 Subject: [PATCH 037/106] server: allow mixed-case model names on push, pull, cp, and create (#7676) This change allows for mixed-case model names to be pushed, pulled, copied, and created, which was previously disallowed because the Ollama registry was backed by a Docker registry that enforced a naming convention that disallowed mixed-case names, which is no longer the case. This does not break existing, intended, behaviors. Also, make TestCase test a story of creating, updating, pulling, and copying a model with case variations, ensuring the model's manifest is updated correctly, and not duplicated across different files with different case variations. --- server/images.go | 19 +++++ server/routes.go | 34 +++++--- server/routes_test.go | 193 ++++++++++++++++++++++++++---------------- types/model/name.go | 7 ++ 4 files changed, 169 insertions(+), 84 deletions(-) diff --git a/server/images.go b/server/images.go index 584b7b13..6a0e8ae3 100644 --- a/server/images.go +++ b/server/images.go @@ -13,6 +13,7 @@ import ( "io" "log" "log/slog" + "net" "net/http" "net/url" "os" @@ -1071,6 +1072,21 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR return nil, errUnauthorized } +// testMakeRequestDialContext specifies the dial function for the http client in +// makeRequest. It can be used to resolve hosts in model names to local +// addresses for testing. For example, the model name ("example.com/my/model") +// can be directed to push/pull from "127.0.0.1:1234". +// +// This is not safe to set across goroutines. It should be set in +// the main test goroutine, and not by tests marked to run in parallel with +// t.Parallel(). +// +// It should be cleared after use, otherwise it will affect other tests. +// +// Ideally we would have some set this up the stack, but the code is not +// structured in a way that makes this easy, so this will have to do for now. 
+var testMakeRequestDialContext func(ctx context.Context, network, addr string) (net.Conn, error) + func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *registryOptions) (*http.Response, error) { if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure { requestURL.Scheme = "http" @@ -1105,6 +1121,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header } resp, err := (&http.Client{ + Transport: &http.Transport{ + DialContext: testMakeRequestDialContext, + }, CheckRedirect: regOpts.CheckRedirect, }).Do(req) if err != nil { diff --git a/server/routes.go b/server/routes.go index c5fd3293..f5b05bb5 100644 --- a/server/routes.go +++ b/server/routes.go @@ -540,7 +540,8 @@ func (s *Server) PullHandler(c *gin.Context) { return } - if err := checkNameExists(name); err != nil { + name, err = getExistingName(name) + if err != nil { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } @@ -621,19 +622,20 @@ func (s *Server) PushHandler(c *gin.Context) { streamResponse(c, ch) } -func checkNameExists(name model.Name) error { - names, err := Manifests(true) +// getExistingName returns the original, on disk name if the input name is a +// case-insensitive match, otherwise it returns the input name. +func getExistingName(n model.Name) (model.Name, error) { + var zero model.Name + existing, err := Manifests(true) if err != nil { - return err + return zero, err } - - for n := range names { - if strings.EqualFold(n.Filepath(), name.Filepath()) && n != name { - return errors.New("a model with that name already exists") + for e := range existing { + if n.EqualFold(e) { + return e, nil } } - - return nil + return n, nil } func (s *Server) CreateHandler(c *gin.Context) { @@ -652,7 +654,8 @@ func (s *Server) CreateHandler(c *gin.Context) { return } - if err := checkNameExists(name); err != nil { + name, err := getExistingName(name) + if err != nil { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } @@ -958,14 +961,19 @@ func (s *Server) CopyHandler(c *gin.Context) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)}) return } + src, err := getExistingName(src) + if err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } dst := model.ParseName(r.Destination) if !dst.IsValid() { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)}) return } - - if err := checkNameExists(dst); err != nil { + dst, err = getExistingName(dst) + if err != nil { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } diff --git a/server/routes_test.go b/server/routes_test.go index bd5b56af..1daf36f1 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -7,13 +7,18 @@ import ( "encoding/json" "fmt" "io" + "io/fs" "math" + "math/rand/v2" + "net" "net/http" "net/http/httptest" "os" + "path/filepath" "sort" "strings" "testing" + "unicode" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" @@ -473,83 +478,129 @@ func Test_Routes(t *testing.T) { } } -func TestCase(t *testing.T) { +func casingShuffle(s string) string { + rr := []rune(s) + for i := range rr { + if rand.N(2) == 0 { + rr[i] = unicode.ToUpper(rr[i]) + } else { + rr[i] = unicode.ToLower(rr[i]) + } + } + return string(rr) +} + +func TestManifestCaseSensitivity(t *testing.T) { t.Setenv("OLLAMA_MODELS", 
t.TempDir()) - cases := []string{ - "mistral", - "llama3:latest", - "library/phi3:q4_0", - "registry.ollama.ai/library/gemma:q5_K_M", - // TODO: host:port currently fails on windows (#4107) - // "localhost:5000/alice/bob:latest", + r := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + io.WriteString(w, `{}`) //nolint:errcheck + })) + defer r.Close() + + nameUsed := make(map[string]bool) + name := func() string { + const fqmn = "example/namespace/model:tag" + for { + v := casingShuffle(fqmn) + if nameUsed[v] { + continue + } + nameUsed[v] = true + return v + } + } + + wantStableName := name() + + // checkManifestList tests that there is strictly one manifest in the + // models directory, and that the manifest is for the model under test. + checkManifestList := func() { + t.Helper() + + mandir := filepath.Join(os.Getenv("OLLAMA_MODELS"), "manifests/") + var entries []string + t.Logf("dir entries:") + fsys := os.DirFS(mandir) + err := fs.WalkDir(fsys, ".", func(path string, info fs.DirEntry, err error) error { + if err != nil { + return err + } + t.Logf(" %s", fs.FormatDirEntry(info)) + if info.IsDir() { + return nil + } + path = strings.TrimPrefix(path, mandir) + entries = append(entries, path) + return nil + }) + if err != nil { + t.Fatalf("failed to walk directory: %v", err) + } + + if len(entries) != 1 { + t.Errorf("len(got) = %d, want 1", len(entries)) + return // do not use Fatal so following steps run + } + + g := entries[0] // raw path + g = filepath.ToSlash(g) + w := model.ParseName(wantStableName).Filepath() + w = filepath.ToSlash(w) + if g != w { + t.Errorf("\ngot: %s\nwant: %s", g, w) + } + } + + checkOK := func(w *httptest.ResponseRecorder) { + t.Helper() + if w.Code != http.StatusOK { + t.Errorf("code = %d, want 200", w.Code) + t.Logf("body: %s", w.Body.String()) + } } var s Server - for _, tt := range cases { - t.Run(tt, func(t *testing.T) { - w := createRequest(t, s.CreateHandler, api.CreateRequest{ - Name: tt, - Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), - Stream: &stream, - }) - - if w.Code != http.StatusOK { - t.Fatalf("expected status 200 got %d", w.Code) - } - - expect, err := json.Marshal(map[string]string{"error": "a model with that name already exists"}) - if err != nil { - t.Fatal(err) - } - - t.Run("create", func(t *testing.T) { - w = createRequest(t, s.CreateHandler, api.CreateRequest{ - Name: strings.ToUpper(tt), - Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), - Stream: &stream, - }) - - if w.Code != http.StatusBadRequest { - t.Fatalf("expected status 500 got %d", w.Code) - } - - if !bytes.Equal(w.Body.Bytes(), expect) { - t.Fatalf("expected error %s got %s", expect, w.Body.String()) - } - }) - - t.Run("pull", func(t *testing.T) { - w := createRequest(t, s.PullHandler, api.PullRequest{ - Name: strings.ToUpper(tt), - Stream: &stream, - }) - - if w.Code != http.StatusBadRequest { - t.Fatalf("expected status 500 got %d", w.Code) - } - - if !bytes.Equal(w.Body.Bytes(), expect) { - t.Fatalf("expected error %s got %s", expect, w.Body.String()) - } - }) - - t.Run("copy", func(t *testing.T) { - w := createRequest(t, s.CopyHandler, api.CopyRequest{ - Source: tt, - Destination: strings.ToUpper(tt), - }) - - if w.Code != http.StatusBadRequest { - t.Fatalf("expected status 500 got %d", w.Code) - } - - if !bytes.Equal(w.Body.Bytes(), expect) { - t.Fatalf("expected error %s got %s", expect, w.Body.String()) - } - }) - }) + testMakeRequestDialContext = func(ctx context.Context, 
_, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "tcp", r.Listener.Addr().String()) } + t.Cleanup(func() { testMakeRequestDialContext = nil }) + + t.Logf("creating") + checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{ + // Start with the stable name, and later use a case-shuffled + // version. + Name: wantStableName, + + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), + Stream: &stream, + })) + checkManifestList() + + t.Logf("creating (again)") + checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{ + Name: name(), + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)), + Stream: &stream, + })) + checkManifestList() + + t.Logf("pulling") + checkOK(createRequest(t, s.PullHandler, api.PullRequest{ + Name: name(), + Stream: &stream, + Insecure: true, + })) + checkManifestList() + + t.Logf("copying") + checkOK(createRequest(t, s.CopyHandler, api.CopyRequest{ + Source: name(), + Destination: name(), + })) + checkManifestList() } func TestShow(t *testing.T) { diff --git a/types/model/name.go b/types/model/name.go index 75b35ef7..9d819f10 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -298,6 +298,13 @@ func (n Name) LogValue() slog.Value { return slog.StringValue(n.String()) } +func (n Name) EqualFold(o Name) bool { + return strings.EqualFold(n.Host, o.Host) && + strings.EqualFold(n.Namespace, o.Namespace) && + strings.EqualFold(n.Model, o.Model) && + strings.EqualFold(n.Tag, o.Tag) +} + func isValidLen(kind partKind, s string) bool { switch kind { case kindHost: From 807ace5b1f4fc9de7347297b3c8a695c566d9fd9 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 19 Nov 2024 15:58:14 -0700 Subject: [PATCH 038/106] fix(runner): Set logits to 0 if false on Batch.Add https://github.com/ollama/ollama/issues/7656 Branch: Granite3StoppingBug-7656 Signed-off-by: Gabe Goodhart --- llama/llama.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama/llama.go b/llama/llama.go index 72b8b691..a3fbbc1d 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -384,6 +384,8 @@ func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ... if logits { unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1 + } else { + unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 0 } b.c.n_tokens += 1 From f602ab4de44756ebffa74a1e3c6e6bd0f3febab3 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 19 Nov 2024 16:26:05 -0800 Subject: [PATCH 039/106] expose underlying error on embedding failure (#7743) Avoid a round-trip asking users for logs to see what went wrong. 
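One detail worth illustrating around the change below: Gin's `c.JSON` serializes the map with `encoding/json` by default, which only emits exported fields, so an error value built with `errors.New` or `fmt.Errorf` marshals as `{}`. To surface the underlying message in the response body it has to be converted to a string (for example with `err.Error()` or `fmt.Sprintf`). The snippet is a self-contained illustration only, not part of the patch:

```go
package main

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
)

func main() {
	err := errors.New("model crashed")

	// An error value has no exported fields, so it serializes as {}.
	asValue := map[string]any{"error": fmt.Errorf("failed to generate embedding: %v", err)}

	// Converting to a string keeps the message in the JSON body.
	asString := map[string]any{"error": fmt.Sprintf("failed to generate embedding: %v", err)}

	enc := json.NewEncoder(os.Stdout)
	enc.Encode(asValue)  // {"error":{}}
	enc.Encode(asString) // {"error":"failed to generate embedding: model crashed"}
}
```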
--- server/routes.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/routes.go b/server/routes.go index f5b05bb5..5dfd6ffe 100644 --- a/server/routes.go +++ b/server/routes.go @@ -507,7 +507,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { embedding, err := r.Embedding(c.Request.Context(), req.Prompt) if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) + c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embedding: %v", err)}) return } From 909a88c5c0242d2dbaeb4b07ff643a9b6b6bada0 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 19 Nov 2024 16:26:57 -0800 Subject: [PATCH 040/106] Improve crash reporting (#7728) Many model crashes are masked behind "An existing connection was forcibly closed by the remote host" This captures that common error message and wires in any detected errors from the log. This also adds the deepseek context shift error to the known errors we capture. --- llm/server.go | 8 +++++--- llm/status.go | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llm/server.go b/llm/server.go index 624acbf8..d7c5198d 100644 --- a/llm/server.go +++ b/llm/server.go @@ -838,13 +838,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } if err := scanner.Err(); err != nil { - if strings.Contains(err.Error(), "unexpected EOF") { + if strings.Contains(err.Error(), "unexpected EOF") || strings.Contains(err.Error(), "forcibly closed") { s.Close() - msg := "" + var msg string if s.status != nil && s.status.LastErrMsg != "" { msg = s.status.LastErrMsg + } else { + msg = err.Error() } - return fmt.Errorf("an unknown error was encountered while running the model %s", msg) + return fmt.Errorf("an error was encountered while running the model: %s", msg) } return fmt.Errorf("error reading llm response: %v", err) diff --git a/llm/status.go b/llm/status.go index 604fe9e0..80f44e65 100644 --- a/llm/status.go +++ b/llm/status.go @@ -27,6 +27,7 @@ var errorPrefixes = []string{ "\"ERR\"", "error loading model", "GGML_ASSERT", + "Deepseek2 does not support K-shift", } func (w *StatusWriter) Write(b []byte) (int, error) { From 0ef17ede89d01126dead58c81aafa4c63be233e3 Mon Sep 17 00:00:00 2001 From: Jonathan Hecl Date: Wed, 20 Nov 2024 02:31:43 -0300 Subject: [PATCH 041/106] readme: add Gollama to community integrations (#7756) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e51be1e4..35282d31 100644 --- a/README.md +++ b/README.md @@ -416,6 +416,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) - [LlamaScript](https://github.com/Project-Llama/llamascript) - [Gollm](https://docs.gollm.co/examples/ollama-example) +- [Gollama for Golang](https://github.com/jonathanhecl/gollama) - [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient) - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun) - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) From bfd30f428682cca87f8dcd953fdd2af754a19f89 Mon Sep 17 00:00:00 2001 From: Gordon Kamer Date: Tue, 19 Nov 2024 21:37:15 -0800 Subject: [PATCH 042/106] readme: add Abbey to community integrations (#7746) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 35282d31..fe707177 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama) - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) +- [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support) ### Terminal From 2f0a8c8778c9258dbe44c2d04b85fac82e3bf7b9 Mon Sep 17 00:00:00 2001 From: rohitanshu <85547195+iamrohitanshu@users.noreply.github.com> Date: Wed, 20 Nov 2024 23:27:32 +0530 Subject: [PATCH 043/106] docs: fix minor typo in import.md (#7764) change 'containg' to 'containing' --- docs/import.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/import.md b/docs/import.md index b90377bf..040fa299 100644 --- a/docs/import.md +++ b/docs/import.md @@ -81,7 +81,7 @@ If you have a GGUF based model or adapter it is possible to import it into Ollam * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or * downloading a model or adapter from a place such as HuggingFace -To import a GGUF model, create a `Modelfile` containg: +To import a GGUF model, create a `Modelfile` containing: ```dockerfile FROM /path/to/file.gguf From d2a25206b1dc4b13796bfdc2182f7e683ca8170c Mon Sep 17 00:00:00 2001 From: Adarsh Mishra <95633830+adarshM84@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:12:55 +0530 Subject: [PATCH 044/106] readme: add opentalkgpt to community integrations (#7707) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index fe707177..00d6a200 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) +- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. 
Supports local models via Ollama) - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support) From 303f4bc79e2b38e0f5456304aa828c61d21b7d42 Mon Sep 17 00:00:00 2001 From: thewh1teagle <61390950+thewh1teagle@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:45:10 +0200 Subject: [PATCH 045/106] readme: add vibe app to community integrations (#7607) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 00d6a200..962d18a9 100644 --- a/README.md +++ b/README.md @@ -440,6 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama) - [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel) - [Continue](https://github.com/continuedev/continue) +- [Vibe](https://github.com/thewh1teagle/vibe) (Transcribe and analyze meetings with Ollama) - [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama) - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq) - [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin) From b8c66d33070e2c7bf2fb2057c260742db11e15b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Ziad=C3=A9?= Date: Wed, 20 Nov 2024 20:49:15 +0200 Subject: [PATCH 046/106] readme: add a swift community integration (#7383) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 962d18a9..c3e02369 100644 --- a/README.md +++ b/README.md @@ -366,6 +366,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor +- [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage) - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. ### Apple Vision Pro @@ -424,6 +425,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) - [Ollama for Swift](https://github.com/mattt/ollama-swift) +- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/) - [GoLamify](https://github.com/prasad89/golamify) - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell) - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API) From ecf41eed0595fb031f1addc179f6abb86d8405f8 Mon Sep 17 00:00:00 2001 From: Emir Sahin <50391065+emirsahin1@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:53:14 -0500 Subject: [PATCH 047/106] readme: add llm-axe to community integrations (#5931) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c3e02369..b158c182 100644 --- a/README.md +++ b/README.md @@ -418,6 +418,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama) - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama) - [LlamaScript](https://github.com/Project-Llama/llamascript) +- [llm-axe](https://github.com/emirsahin1/llm-axe) (Python Toolkit for Building LLM Powered Apps) - [Gollm](https://docs.gollm.co/examples/ollama-example) - [Gollama for Golang](https://github.com/jonathanhecl/gollama) - [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient) From 5f68fcab127b157b2886c22e02585b600adfe7b4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 19 Nov 2024 16:41:42 -0800 Subject: [PATCH 048/106] runner.go: Use correct index when retrieving embedding results This doesn't have any impact currently because NUM_PARALLEL is forced to 1 for embeddings, so both indicies will always be 0. --- llama/runner/runner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index a2da546f..b8fc7cc6 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -454,7 +454,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) // if done processing the prompt, generate an embedding and return if seq.embeddingOnly { - embed := s.lc.GetEmbeddingsSeq(i) + embed := s.lc.GetEmbeddingsSeq(seq.cache.Id) if embed == nil { embed = s.lc.GetEmbeddingsIth(seq.iBatch) } From 7121dfa309c297f2dc1e9f6f69ab5437a4f1be66 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 19 Nov 2024 11:00:41 -0800 Subject: [PATCH 049/106] runner.go: Retry decoding after defragmentation if needed Fragmentation of the KV cache can occur due to cache shifting or different sequences getting processed. Decode uses a heuristic to decide if it should defrag. However, this heuristic isn't 100% accurate, so decoding can sometimes fail by surprise. For these cases, if decode indicates that there is no KV cache space, we should defrag and then try again. 
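Condensed from the diff that follows, the recovery path described above looks roughly like the sketch below. It is an outline only: it assumes the `errors`, `log/slog`, and `llama` imports already used by runner.go, and it folds the defrag and the retry onto a single context, whereas the actual change calls `s.cache.lc.KvCacheDefrag()` and then retries `s.lc.Decode(batch)`.

```go
// decodeWithDefrag attempts a decode; if the KV cache reports it is full,
// defragment it once and retry before giving up.
func decodeWithDefrag(lc *llama.Context, batch *llama.Batch) error {
	err := lc.Decode(batch)
	if errors.Is(err, llama.ErrKvCacheFull) {
		slog.Debug("defragmenting kv cache")
		lc.KvCacheDefrag()
		err = lc.Decode(batch)
	}
	return err
}
```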
--- integration/context_test.go | 31 +++++++++++++++++++++++++++++++ llama/llama.go | 14 ++++++++++---- llama/runner/runner.go | 11 +++++++++-- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/integration/context_test.go b/integration/context_test.go index f1342e16..add41a76 100644 --- a/integration/context_test.go +++ b/integration/context_test.go @@ -10,7 +10,38 @@ import ( "github.com/ollama/ollama/api" ) +func TestLongInputContext(t *testing.T) { + // Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what + // we asked for and there is nothing extra that we could spill over into + t.Setenv("OLLAMA_NUM_PARALLEL", "1") + + // Longer needed for small footprint GPUs + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + // Set up the test data + req := api.GenerateRequest{ + Model: "llama2", + Prompt: "Oh, don’t speak to me of Austria. Perhaps I don’t understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander’s loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don’t believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! 
What country is this referring to?", + Stream: &stream, + Options: map[string]interface{}{ + "temperature": 0, + "seed": 123, + "num_ctx": 128, + }, + } + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + if err := PullIfMissing(ctx, client, req.Model); err != nil { + t.Fatalf("PullIfMissing failed: %v", err) + } + DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second) +} + func TestContextExhaustion(t *testing.T) { + // Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what + // we asked for and there is nothing extra that we could spill over into + t.Setenv("OLLAMA_NUM_PARALLEL", "1") + // Longer needed for small footprint GPUs ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() diff --git a/llama/llama.go b/llama/llama.go index a3fbbc1d..468540f5 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -157,9 +157,7 @@ type Context struct { numThreads int } -func (c *Context) KvCacheClear() { - C.llama_kv_cache_clear(c.c) -} +var ErrKvCacheFull = errors.New("could not find a kv cache slot") func (c *Context) Decode(batch *Batch) error { // Positive return values does not mean a fatal error, but rather a warning. @@ -173,7 +171,7 @@ func (c *Context) Decode(batch *Batch) error { } if code > 0 { - return fmt.Errorf("could not find a KV slot for the batch - try reducing the size of the batch or increase the context. code: %d", code) + return ErrKvCacheFull } return nil @@ -195,6 +193,14 @@ func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) { C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1)) } +func (c *Context) KvCacheClear() { + C.llama_kv_cache_clear(c.c) +} + +func (c *Context) KvCacheDefrag() { + C.llama_kv_cache_defrag(c.c) +} + // Get the embeddings for a sequence id func (c *Context) GetEmbeddingsSeq(seqId int) []float32 { embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId))) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index b8fc7cc6..a41573ae 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -426,8 +426,15 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) err := s.lc.Decode(batch) if err != nil { - slog.Error("failed to decode batch", "error", err) - return + if errors.Is(err, llama.ErrKvCacheFull) { + slog.Debug("defragmenting kv cache") + s.cache.lc.KvCacheDefrag() + err = s.lc.Decode(batch) + } + if err != nil { + slog.Error("failed to decode batch", "error", err) + return + } } if crossAttention { From 3fc1dc0e6f32a22063db22a4dc72a75f8411a663 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 19 Nov 2024 10:55:29 -0800 Subject: [PATCH 050/106] runner.go: Hard fail on errors rather than potentially infinite looping We try to recover from errors by dropping the tokens that caused the problem and re-trying. However, dropping the tokens is not correct and continuing often leads to infinite loops. To avoid, this we end the sequence if such a condition is detected, which is also surprising. At this point, it is better to just report the error. This will make it easier to find problems and the alternatives are perhaps even more surprising to users. This is not a very satisfactory solution either - we should isolate the error and return it to the user without killing the whole process. 
However, this is an incremental step and consistent with most other failures (which either manifest as abort() or panic). --- llama/runner/runner.go | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index a41573ae..a38cce91 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -324,7 +324,11 @@ func (s *Server) run(ctx context.Context) { case <-ctx.Done(): return default: - s.processBatch(tokenBatch, embedBatch) + err := s.processBatch(tokenBatch, embedBatch) + if err != nil { + panic(err) + } + tokenBatch.Clear() embedBatch.Clear() } @@ -338,7 +342,7 @@ func (s *Server) run(ctx context.Context) { // these should instead be handled by the handlers // it should only be responsible for accepting tokens or embeddings and // processing batches as fast as possible -func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) { +func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) error { s.mu.Lock() for s.allNil() { s.cond.Wait() // Wait until an item is added @@ -357,14 +361,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) continue } - // If an error occurred during the processing of a previous batch then we may have emptied the inputs - // without adding a new one. In this case, end the sequence rather than infinite looping. - if len(seq.inputs) == 0 { - slog.Error("removing sequence due to no input tokens", "index", seqIdx, "cache id", seq.cache.Id) - s.removeSequence(seqIdx, "error") - continue - } - // if past the num predict limit if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict { s.removeSequence(seqIdx, "limit") @@ -419,7 +415,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) } if batch == nil || batch.NumTokens() == 0 { - return + return nil } s.lc.SetCrossAttention(crossAttention) @@ -432,8 +428,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) err = s.lc.Decode(batch) } if err != nil { - slog.Error("failed to decode batch", "error", err) - return + return fmt.Errorf("failed to decode batch: %w", err) } } @@ -531,6 +526,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) s.removeSequence(i, "connection") } } + + return nil } // TODO (jmorganca): use structs from the api package to avoid duplication From c3ff9164317940ec09534fd2370ec604a0de32ad Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 19 Nov 2024 10:51:47 -0800 Subject: [PATCH 051/106] runner.go: Don't add inputs to cache view until actually processed We need to track which tokens are in the cache ourselves. We currently add tokens to the cache tracker when we add them to batch but they are not actually in the cache until we call Decode. This can cause confusion when we are shifting the cache. Avoids "could not find a KV slot for the batch" issues. Bug #7545 --- llama/runner/cache.go | 16 ++++++++++++---- llama/runner/runner.go | 33 +++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/llama/runner/cache.go b/llama/runner/cache.go index 190ccdff..b487fe25 100644 --- a/llama/runner/cache.go +++ b/llama/runner/cache.go @@ -203,7 +203,11 @@ func countCommonPrefix(a []input, b []input) int { // the newest half into that space (saving numKeep inputs at the beginning). // // Assumes that at least 1 entry can be freed up by shifting (i.e. 
numKeep < numCtx) -func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) { +func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error { + if numKeep >= c.numCtx { + return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx) + } + targetFree := (c.numCtx - numKeep) / 2 targetFree = max(targetFree, 1) @@ -211,18 +215,22 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) { discard := targetFree - currentFree if discard <= 0 { - return + return nil } - slog.Debug("context limit hit - shifting", "limit", c.numCtx, "input", len(slot.Inputs), + slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs), "keep", numKeep, "discard", discard) // TODO (jessegross): KV cache removal can fail for certain types of models - c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) + if !c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) { + return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard) + } c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard) for i := numKeep + discard; i < len(slot.Inputs); i++ { slot.Inputs[i-discard] = slot.Inputs[i] } slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard] + + return nil } diff --git a/llama/runner/runner.go b/llama/runner/runner.go index a38cce91..1ed25c27 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -45,6 +45,9 @@ type Sequence struct { // prompt inputs left to evaluate inputs []input + // inputs that have been added to a batch but not yet submitted to Decode + pendingInputs []input + // tokens that have been generated but not returned yet (e.g. for stop sequences) pendingResponses []string @@ -367,14 +370,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) continue } - var numInputsProcessed int - shifted := false - for i, input := range seq.inputs { - if len(seq.cache.Inputs)+1 > s.cache.numCtx { - if !shifted { - s.cache.ShiftCacheSlot(seq.cache, seq.numKeep) - shifted = true + if len(seq.cache.Inputs)+len(seq.pendingInputs)+1 > s.cache.numCtx { + if len(seq.pendingInputs) == 0 { + err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep) + if err != nil { + return err + } } else { break } @@ -403,15 +405,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) } crossAttention = seq.crossAttention - batch.Add(input.token, input.embed, len(seq.cache.Inputs), i+1 == len(seq.inputs), seq.cache.Id) - seq.cache.Inputs = append(seq.cache.Inputs, input) - numInputsProcessed++ - } - - if numInputsProcessed > 0 { - seq.inputs = seq.inputs[numInputsProcessed:] + batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) + seq.pendingInputs = append(seq.pendingInputs, input) seq.iBatch = batch.NumTokens() - 1 } + + seq.inputs = seq.inputs[len(seq.pendingInputs):] } if batch == nil || batch.NumTokens() == 0 { @@ -444,6 +443,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) continue } + // After calling Decode, pending inputs are now in the cache + if len(seq.pendingInputs) > 0 { + seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...) 
+ seq.pendingInputs = []input{} + } + // don't sample prompt processing if len(seq.inputs) != 0 { continue From c4b34f2a2af5ce3fe7b05ae2d3334e155029ce6b Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Nov 2024 10:39:56 -0800 Subject: [PATCH 052/106] runner.go: Truncate inputs that exceed context rather than shifting Previous versions of the runner would truncate inputs to the context window before beginning processing. The main processing loop relied on this behavior if the context needed to be shifted later (due to token generation). If truncation did not occur then invariants would be broken, causing crashes or infinite loops. Later versions attempted to fix these bugs and make the logic less subtle so that all inputs could be handled. Truncation was removed to make things consistent. However, truncation is much faster than processing and shifting, so removing it caused performance problems when the input vastly exceeded the context size. This restores the input truncation as a performance optimization while keeping the more robust processing logic. Fixes #7762 --- llama/runner/runner.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 1ed25c27..c7662b33 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -122,7 +122,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen params.numKeep = min(params.numKeep, s.cache.numCtx-1) if len(inputs) > s.cache.numCtx { - slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx) + slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep) + newInputs := inputs[:params.numKeep] + newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...) + inputs = newInputs } var sc *llama.SamplingContext From d8632982102ac225349f8c62235ac56de8c63531 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 20 Nov 2024 16:00:46 -0800 Subject: [PATCH 053/106] docs: Link to AMD guide on multi-GPU guidance (#7744) --- docs/troubleshooting.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 3400b4e8..24e8e962 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -104,6 +104,12 @@ If you are experiencing problems getting Ollama to correctly discover or use you - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported - Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd` +## Multiple AMD GPUs + +If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide. + +- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations + ## Windows Terminal Errors Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer. 
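For readers skimming the truncation change in PATCH 052 above, the head-plus-tail arithmetic can be sketched in isolation. This is a minimal, standalone Go illustration under assumed names (a `truncate` helper and an `[]int` slice standing in for the runner's input type); it is not code from the repository, it only mirrors the slicing shown in the hunk.

```go
package main

import "fmt"

// truncate keeps the first numKeep items plus the newest (numCtx - numKeep)
// items, so the result is exactly numCtx long. It assumes numKeep < numCtx,
// which the runner clamps for elsewhere.
func truncate(inputs []int, numCtx, numKeep int) []int {
	if len(inputs) <= numCtx {
		return inputs
	}
	out := append([]int{}, inputs[:numKeep]...)
	return append(out, inputs[len(inputs)-numCtx+numKeep:]...)
}

func main() {
	inputs := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	// With numCtx=6 and numKeep=2 the middle drops out: [0 1 6 7 8 9]
	fmt.Println(truncate(inputs, 6, 2))
}
```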
From fce30f407a39804867c01b599953c221c17025ac Mon Sep 17 00:00:00 2001 From: Nikita Ganzikov Date: Thu, 21 Nov 2024 09:01:58 +0300 Subject: [PATCH 054/106] app: typo in wintray messages const (#7705) --- app/tray/wintray/menus.go | 2 +- app/tray/wintray/messages.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/tray/wintray/menus.go b/app/tray/wintray/menus.go index 9421b489..0b13d7cb 100644 --- a/app/tray/wintray/menus.go +++ b/app/tray/wintray/menus.go @@ -39,7 +39,7 @@ func (t *winTray) UpdateAvailable(ver string) error { if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil { return fmt.Errorf("unable to create menu entries %w", err) } - if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil { + if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenuTitle, false); err != nil { return fmt.Errorf("unable to create menu entries %w", err) } if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil { diff --git a/app/tray/wintray/messages.go b/app/tray/wintray/messages.go index d364c716..64a47855 100644 --- a/app/tray/wintray/messages.go +++ b/app/tray/wintray/messages.go @@ -10,6 +10,6 @@ const ( quitMenuTitle = "Quit Ollama" updateAvailableMenuTitle = "An update is available" - updateMenutTitle = "Restart to update" + updateMenuTitle = "Restart to update" diagLogsMenuTitle = "View logs" ) From c5e238e8e53fe26a056854c94bac20377d3185b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20F=2E=20R=C3=B8dseth?= <52813+xyproto@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:24:05 +0100 Subject: [PATCH 055/106] readme: orbiton to community integrations (#7770) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b158c182..d5d1d95e 100644 --- a/README.md +++ b/README.md @@ -368,6 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage) - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. +- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. ### Apple Vision Pro - [Enchanted](https://github.com/AugustDev/enchanted) From 6a89dcf848b1d041c7c9959dfc46c8c9b037df89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A5=B6=E8=8C=B6=E5=8F=94=E5=8F=94?= <43770875+anan1213095357@users.noreply.github.com> Date: Thu, 21 Nov 2024 15:30:10 +0800 Subject: [PATCH 056/106] readme: flutter-based chat app to community integrations (#7221) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d5d1d95e..4156c6ff 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) 
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) +- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) From 1a742f54c91a444295342cdaead2f3c4e5460de6 Mon Sep 17 00:00:00 2001 From: boessu Date: Thu, 21 Nov 2024 08:48:55 +0100 Subject: [PATCH 057/106] readme: update AMD ROCm links (#7213) --- llama/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama/README.md b/llama/README.md index ec54b989..79bf4fde 100644 --- a/llama/README.md +++ b/llama/README.md @@ -55,7 +55,7 @@ go build -tags avx,cuda . ### ROCm -Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive): +Install [ROCm](https://rocm.docs.amd.com/en/latest/). ```shell make ggml_hipblas.so @@ -77,7 +77,7 @@ go build -tags avx,cuda . ### ROCm -Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/). +Install [ROCm](https://rocm.docs.amd.com/en/latest/). ```shell make ggml_hipblas.dll From 7e9209175188c639817d149fb6a51e8db82631d5 Mon Sep 17 00:00:00 2001 From: drunkwcodes <36228443+drunkwcodes@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:03:11 +0800 Subject: [PATCH 058/106] readme: Terminal app bb7 to community integrations (#7064) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4156c6ff..d7c48aac 100644 --- a/README.md +++ b/README.md @@ -367,6 +367,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor +- [bb7](https://github.com/drunkwcodes/bb7) - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage) - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. From 0e5f31a86d960ca5d585a14d5e69e7173c276f35 Mon Sep 17 00:00:00 2001 From: Andy Gill Date: Thu, 21 Nov 2024 02:11:39 -0600 Subject: [PATCH 059/106] readme: add Haverscript to community integrations (#6945) Haverscript uses classical functional programming techniques to provide a composable interface for interacting with ollama-hosted LLMs. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d7c48aac..c1878571 100644 --- a/README.md +++ b/README.md @@ -428,6 +428,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun) - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) +- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples) - [Ollama for Swift](https://github.com/mattt/ollama-swift) - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/) - [GoLamify](https://github.com/prasad89/golamify) From 20623cec13890f015cf02d707fb66a710b808a61 Mon Sep 17 00:00:00 2001 From: Laurent Eschenauer Date: Thu, 21 Nov 2024 09:13:54 +0100 Subject: [PATCH 060/106] readme: add OpenGPA to community integrations (#5497) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1878571..095e2496 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) +- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations) - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) - [AI Studio](https://github.com/MindWorkAI/AI-Studio) From 3f87f71755910c1c1a2c260aa23a48cea58c942e Mon Sep 17 00:00:00 2001 From: Nico <1622112+nicarq@users.noreply.github.com> Date: Thu, 21 Nov 2024 02:16:18 -0600 Subject: [PATCH 061/106] readme: add Shinkai Desktop to community integrations (#4877) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 095e2496..3acda521 100644 --- a/README.md +++ b/README.md @@ -311,6 +311,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama) - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) +- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG) - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) From b7aa5ee06c6b4eb3725aaaa5fa7de4a7ce4bd412 Mon Sep 17 00:00:00 2001 From: chyok Date: Thu, 21 Nov 2024 16:19:24 +0800 Subject: [PATCH 062/106] readme: Add tkinter-based client to community based integrations (#5412) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3acda521..30d4d35a 100644 --- a/README.md +++ b/README.md @@ -331,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library) - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) +- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama) - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) From c4f27225acd196d0cba3957419834de393b4ecd0 Mon Sep 17 00:00:00 2001 From: Kevin Brake Date: Thu, 21 Nov 2024 05:01:27 -0330 Subject: [PATCH 063/106] readme: add community contribution to readme ollama-kis (#5575) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 30d4d35a..24d61e53 100644 --- a/README.md +++ b/README.md @@ -314,6 +314,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG) - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) +- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations) - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) From 431075fcbbd1f93c0e39730b3beba98f6bfd6d51 Mon Sep 17 00:00:00 2001 From: Aarushi <50577581+aarushik93@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:51:38 +0000 Subject: [PATCH 064/106] readme: add autogpt integration to list of community integrations (#6459) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 24d61e53..db8e9bfa 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac) - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend) +- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration) - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang) - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery) - [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j From 811bafba8243fbc246470ea4b4fbd07bf04d1564 Mon Sep 17 00:00:00 2001 From: Franco Lombardo Date: Thu, 21 Nov 2024 09:54:26 +0100 Subject: [PATCH 065/106] readme: Add LLPhant to community integrations (#5679) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index db8e9bfa..82adc74d 100644 --- a/README.md +++ b/README.md @@ -402,6 +402,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) +- [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama) - [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama) - [LiteLLM](https://github.com/BerriAI/litellm) - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm) From f5ec7cc87281e77db80022ef4658188f12584555 Mon Sep 17 00:00:00 2001 From: Dezoito Date: Thu, 21 Nov 2024 06:02:46 -0300 Subject: [PATCH 066/106] readme: add ollama grid search, a community project (#4301) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 82adc74d..e54b2b3a 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG) - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation) - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends) +- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models) - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama) - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) @@ -489,4 +490,3 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Supported backends - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. 
- From e4c9f75b2326deea746cdf72ffb0c7531be8235e Mon Sep 17 00:00:00 2001 From: Jakub Burkiewicz Date: Thu, 21 Nov 2024 10:09:37 +0100 Subject: [PATCH 067/106] readme: add node-red-contrib-ollama to community integrations (#4648) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e54b2b3a..696131ab 100644 --- a/README.md +++ b/README.md @@ -479,6 +479,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) +- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama) - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) From 883d80e0972df1f12a881fe1ba742c361f64e5e7 Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 21 Nov 2024 10:46:20 +0100 Subject: [PATCH 068/106] readme: add Promptery to community integrations (#7093) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 696131ab..be01628f 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) +- [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.) - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) From 155734e09ae066efe26bca19d015ead10ea9d99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Szczygli=C5=84ski?= <61396542+szczyglis-dev@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:54:39 +0100 Subject: [PATCH 069/106] readme: add community integration py-gpt (#6503) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index be01628f..d07a684f 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac) - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend) +- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac) - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration) - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang) - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery) From b4348bdd2562ce8025f8552954a3ea82f91e0d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Charri=C3=A8re?= Date: Thu, 21 Nov 2024 11:00:32 +0100 Subject: [PATCH 070/106] readme: add Parakeet to community integrations Parakeet is a GoLang SDK for Ollama --------- Co-authored-by: Parth Sareen --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d07a684f..92567fda 100644 --- a/README.md +++ b/README.md @@ -437,6 +437,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun) - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) +- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama. - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples) - [Ollama for Swift](https://github.com/mattt/ollama-swift) - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/) From 7fbcd55da303d71111641f2315d73eafbbcbb456 Mon Sep 17 00:00:00 2001 From: Christian Tzolov Date: Thu, 21 Nov 2024 11:02:14 +0100 Subject: [PATCH 071/106] readme: Add Spring AI library reference (#5981) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 92567fda..90881e4a 100644 --- a/README.md +++ b/README.md @@ -402,6 +402,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/) - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) - [crewAI](https://github.com/crewAIInc/crewAI) +- [Spring AI](https://github.com/spring-projects/spring-ai) with [reference](https://docs.spring.io/spring-ai/reference/api/chat/ollama-chat.html) and [example](https://github.com/tzolov/ollama-tools) - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) From fb2c9594e0591417e750195205ddf8ec3d5c7158 Mon Sep 17 00:00:00 2001 From: Cyril Blaecke <1692273+cbldev@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:07:17 +0100 Subject: [PATCH 072/106] readme: Add Nosia to Community Integrations (#5381) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 90881e4a..694ca601 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama) +- [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama) - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support) From 37711578a29f86cedcd100fad5c12f4f3277f8b9 Mon Sep 17 00:00:00 2001 From: emrgnt-cmplxty <68796651+emrgnt-cmplxty@users.noreply.github.com> Date: Thu, 21 Nov 2024 02:09:36 -0800 Subject: [PATCH 073/106] readme: add R2R to community integrations (#5587) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 694ca601..7547776f 100644 --- a/README.md +++ b/README.md @@ -315,6 +315,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG) - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) +- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine) - [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations) From 2157b1232e1b28448f2e09062d371aff46574fc3 Mon Sep 17 00:00:00 2001 From: xuyangbocn Date: Thu, 21 Nov 2024 18:28:57 +0800 Subject: [PATCH 074/106] readme: add Terraform AWS Ollama & Open WebUI community example (#5633) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7547776f..b0d3141d 100644 --- a/README.md +++ b/README.md @@ -485,6 +485,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) +- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.) - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama) - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) From baa41be2aafbcb56a7e17710e23b885d2baa5adb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AF=9B=E5=B7=B3=E7=85=9C?= <68628461+lemonit-eric-mao@users.noreply.github.com> Date: Thu, 21 Nov 2024 18:51:45 +0800 Subject: [PATCH 075/106] readme: add a community made ollama web management tool (#7126) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b0d3141d..953ecf72 100644 --- a/README.md +++ b/README.md @@ -340,6 +340,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) +- [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page) - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.) 
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) From 6a0c2ec50f796e547b6db8d17396a03d0105ffe4 Mon Sep 17 00:00:00 2001 From: Paul Robello Date: Thu, 21 Nov 2024 02:55:35 -0800 Subject: [PATCH 076/106] readme: add terminal tool ParLlama to community integrations (#5623) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 953ecf72..c0cf6638 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [tlm](https://github.com/yusufcanb/tlm) - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) +- [ParLlama](https://github.com/paulrobello/parllama) - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor From b7bddeebc1ed267004c1f555fb48fa1d48a37303 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Fri, 22 Nov 2024 00:28:04 +0800 Subject: [PATCH 077/106] env.sh: cleanup unused RELEASE_IMAGE_REPO (#6855) Signed-off-by: Xiaodong Ye --- scripts/env.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/env.sh b/scripts/env.sh index 22b4ee4e..c5e6f530 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -5,7 +5,6 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V # TODO - consider `docker buildx ls --format=json` to autodiscover platform capability PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"} DOCKER_ORG=${DOCKER_ORG:-"ollama"} -RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"} FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"} OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \ --build-arg=GOFLAGS \ From 27d9c749d5aee052bf9658801bdc02443728b6cc Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 21 Nov 2024 09:59:53 -0800 Subject: [PATCH 078/106] docs: remove tutorials, add cloud section to community integrations (#7784) --- README.md | 13 +++-- docs/tutorials/fly-gpu.md | 83 -------------------------------- docs/tutorials/langchainjs.md | 77 ----------------------------- docs/tutorials/langchainpy.md | 85 --------------------------------- docs/tutorials/nvidia-jetson.md | 15 ------ 5 files changed, 10 insertions(+), 263 deletions(-) delete mode 100644 docs/tutorials/fly-gpu.md delete mode 100644 docs/tutorials/langchainjs.md delete mode 100644 docs/tutorials/langchainpy.md delete mode 100644 docs/tutorials/nvidia-jetson.md diff --git a/README.md b/README.md index c0cf6638..4fbafec7 100644 --- a/README.md +++ b/README.md @@ -316,8 +316,8 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine) -- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) -- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) +- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) +- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations) - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) - [AI Studio](https://github.com/MindWorkAI/AI-Studio) @@ -350,9 +350,15 @@ See the [API documentation](./docs/api.md) for all endpoints. - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama) - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama) -- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) +- [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support) +### Cloud + +- [Google Cloud](https://cloud.google.com/run/docs/tutorials/gpu-gemma2-with-ollama) +- [Fly.io](https://fly.io/docs/python/do-more/add-ollama/) +- [Koyeb](https://www.koyeb.com/deploy/ollama) + ### Terminal - [oterm](https://github.com/ggozad/oterm) @@ -385,6 +391,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. ### Apple Vision Pro + - [Enchanted](https://github.com/AugustDev/enchanted) ### Database diff --git a/docs/tutorials/fly-gpu.md b/docs/tutorials/fly-gpu.md deleted file mode 100644 index 24802ddb..00000000 --- a/docs/tutorials/fly-gpu.md +++ /dev/null @@ -1,83 +0,0 @@ -# Running Ollama on Fly.io GPU Instances - -Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started. 
- -Create a new app with `fly apps create`: - -```bash -fly apps create -``` - -Then create a `fly.toml` file in a new folder that looks like this: - -```toml -app = "sparkling-violet-709" -primary_region = "ord" -vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info - -[build] - image = "ollama/ollama" - -[http_service] - internal_port = 11434 - force_https = false - auto_stop_machines = true - auto_start_machines = true - min_machines_running = 0 - processes = ["app"] - -[mounts] - source = "models" - destination = "/root/.ollama" - initial_size = "100gb" -``` - -Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app: - -```bash -fly ips allocate-v6 --private -``` - -Then deploy your app: - -```bash -fly deploy -``` - -And finally you can access it interactively with a new Fly.io Machine: - -``` -fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama -``` - -```bash -$ ollama run openchat:7b-v3.5-fp16 ->>> How do I bake chocolate chip cookies? - To bake chocolate chip cookies, follow these steps: - -1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat. - -2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4 -cup packed brown sugar until light and fluffy. - -3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1 -teaspoon of pure vanilla extract. - -4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon -salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined. - -5. Fold in 2 cups of chocolate chips (or chunks) into the dough. - -6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart. - -7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft. - -8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack -to cool completely. - -Enjoy your homemade chocolate chip cookies! -``` - -When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`. - -And that's it! diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md deleted file mode 100644 index 86f895ae..00000000 --- a/docs/tutorials/langchainjs.md +++ /dev/null @@ -1,77 +0,0 @@ -# Using LangChain with Ollama using JavaScript - -In this tutorial, we are going to use JavaScript with LangChain and Ollama to learn about something just a touch more recent. In August 2023, there was a series of wildfires on Maui. There is no way an LLM trained before that time can know about this, since their training data would not include anything as recent as that. So we can find the [Wikipedia article about the fires](https://en.wikipedia.org/wiki/2023_Hawaii_wildfires) and ask questions about the contents. 
- -To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**: - -```bash -npm install @langchain/community -``` - -Now we can start building out our JavaScript: - -```javascript -import { Ollama } from "@langchain/community/llms/ollama"; - -const ollama = new Ollama({ - baseUrl: "http://localhost:11434", - model: "llama3.2", -}); - -const answer = await ollama.invoke(`why is the sky blue?`); - -console.log(answer); -``` - -That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. - -```bash -npm install cheerio -``` - -```javascript -import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio"; - -const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires"); -const data = await loader.load(); -``` - -That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need. - -```javascript -npm install @tensorflow/tfjs-core@3.6.0 @tensorflow/tfjs-converter@3.6.0 @tensorflow-models/universal-sentence-encoder@1.3.3 @tensorflow/tfjs-node@4.10.0 -``` - -If you just install those components without the version numbers, it will install the latest versions, but there are conflicts within **Tensorflow**, so you need to install the compatible versions. - -```javascript -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" -import { MemoryVectorStore } from "langchain/vectorstores/memory"; -import "@tensorflow/tfjs-node"; -import { TensorFlowEmbeddings } from "langchain/embeddings/tensorflow"; - -// Split the text into 500 character chunks. And overlap each chunk by 20 characters -const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 20 -}); -const splitDocs = await textSplitter.splitDocuments(data); - -// Then use the TensorFlow Embedding to store these chunks in the datastore -const vectorStore = await MemoryVectorStore.fromDocuments(splitDocs, new TensorFlowEmbeddings()); -``` - -To connect the datastore to a question asked to a LLM, we need to use the concept at the heart of **LangChain**: the chain. Chains are a way to connect a number of activities together to accomplish a particular tasks. There are a number of chain types available, but for this tutorial we are using the **RetrievalQAChain**. 
- -```javascript -import { RetrievalQAChain } from "langchain/chains"; - -const retriever = vectorStore.asRetriever(); -const chain = RetrievalQAChain.fromLLM(ollama, retriever); -const result = await chain.call({query: "When was Hawaii's request for a major disaster declaration approved?"}); -console.log(result.text) -``` - -So we created a retriever, which is a way to return the chunks that match a query from a datastore. And then connect the retriever and the model via a chain. Finally, we send a query to the chain, which results in an answer using our document as a source. The answer it returned was correct, August 10, 2023. - -And that is a simple introduction to what you can do with **LangChain** and **Ollama.** diff --git a/docs/tutorials/langchainpy.md b/docs/tutorials/langchainpy.md deleted file mode 100644 index 359d3cbd..00000000 --- a/docs/tutorials/langchainpy.md +++ /dev/null @@ -1,85 +0,0 @@ -# Using LangChain with Ollama in Python - -Let's imagine we are studying the classics, such as **the Odyssey** by **Homer**. We might have a question about Neleus and his family. If you ask llama2 for that info, you may get something like: - -> I apologize, but I'm a large language model, I cannot provide information on individuals or families that do not exist in reality. Neleus is not a real person or character, and therefore does not have a family or any other personal details. My apologies for any confusion. Is there anything else I can help you with? - -This sounds like a typical censored response, but even llama2-uncensored gives a mediocre answer: - -> Neleus was a legendary king of Pylos and the father of Nestor, one of the Argonauts. His mother was Clymene, a sea nymph, while his father was Neptune, the god of the sea. - -So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python. - -Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package: - -`pip install langchain_community` - -Then we can create a model and ask the question: - -```python -from langchain_community.llms import Ollama -ollama = Ollama( - base_url='http://localhost:11434', - model="llama3" -) -print(ollama.invoke("why is the sky blue")) -``` - -Notice that we are defining the model and the base URL for Ollama. - -Now let's load a document to ask questions against. I'll load up the Odyssey by Homer, which you can find at Project Gutenberg. We will need **WebBaseLoader** which is part of **LangChain** and loads text from any webpage. On my machine, I also needed to install **bs4** to get that to work, so run `pip install bs4`. - -```python -from langchain.document_loaders import WebBaseLoader -loader = WebBaseLoader("https://www.gutenberg.org/files/1727/1727-h/1727-h.htm") -data = loader.load() -``` - -This file is pretty big. Just the preface is 3000 tokens. Which means the full document won't fit into the context for the model. So we need to split it up into smaller pieces. - -```python -from langchain.text_splitter import RecursiveCharacterTextSplitter - -text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) -all_splits = text_splitter.split_documents(data) -``` - -It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. 
We will use ChromaDB in this example for a vector database. `pip install chromadb` -We also need to pull embedding model: `ollama pull nomic-embed-text` -```python -from langchain.embeddings import OllamaEmbeddings -from langchain.vectorstores import Chroma -oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text") -vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed) -``` - -Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text. - -```python -question="Who is Neleus and who is in Neleus' family?" -docs = vectorstore.similarity_search(question) -len(docs) -``` - -This will output the number of matches for chunks of data similar to the search. - -The next thing is to send the question and the relevant parts of the docs to the model to see if we can get a good answer. But we are stitching two parts of the process together, and that is called a chain. This means we need to define a chain: - -```python -from langchain.chains import RetrievalQA -qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever()) -res = qachain.invoke({"query": question}) -print(res['result']) -``` - -The answer received from this chain was: - -> Neleus is a character in Homer's "Odyssey" and is mentioned in the context of Penelope's suitors. Neleus is the father of Chloris, who is married to Neleus and bears him several children, including Nestor, Chromius, Periclymenus, and Pero. Amphinomus, the son of Nisus, is also mentioned as a suitor of Penelope and is known for his good natural disposition and agreeable conversation. - -It's not a perfect answer, as it implies Neleus married his daughter when actually Chloris "was the youngest daughter to Amphion son of Iasus and king of Minyan Orchomenus, and was Queen in Pylos". - -I updated the chunk_overlap for the text splitter to 20 and tried again and got a much better answer: - -> Neleus is a character in Homer's epic poem "The Odyssey." He is the husband of Chloris, who is the youngest daughter of Amphion son of Iasus and king of Minyan Orchomenus. Neleus has several children with Chloris, including Nestor, Chromius, Periclymenus, and Pero. - -And that is a much better answer. diff --git a/docs/tutorials/nvidia-jetson.md b/docs/tutorials/nvidia-jetson.md deleted file mode 100644 index bb77c486..00000000 --- a/docs/tutorials/nvidia-jetson.md +++ /dev/null @@ -1,15 +0,0 @@ -# Running Ollama on NVIDIA Jetson Devices - -Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions. - -The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0. - -- Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh` -- Pull the model you want to use (e.g. mistral): `ollama pull mistral` -- Start an interactive session: `ollama run mistral` - -And that's it! - -# Running Ollama in Docker - -When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers). 
\ No newline at end of file From eaaf5d309d0d53c0d99982eaf1b2da5da05f42fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B9=9B=E9=9C=B2=E5=85=88=E7=94=9F?= Date: Fri, 22 Nov 2024 03:20:48 +0800 Subject: [PATCH 079/106] cmd: delete duplicated call to sb.Reset() (#7308) Signed-off-by: zhanluxianshen --- cmd/interactive.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index abbf05f4..b495a109 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -319,8 +319,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Messages = append(opts.Messages, newMessage) } fmt.Println("Set system message.") - sb.Reset() - sb.Reset() continue default: From 723f285813f504375f0e6be6c76edfbaaabd961f Mon Sep 17 00:00:00 2001 From: Elias <16616409+EliasPereirah@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:23:42 -0300 Subject: [PATCH 080/106] readme: add OrionChat to community integrations (#7084) OrionChat is a free web-based chat interface that simplifies interactions with multiple AI model providers. It provides a unified platform for chatting and exploring multiple large language models (LLMs). --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4fbafec7..0eaae44f 100644 --- a/README.md +++ b/README.md @@ -339,6 +339,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama) - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) +- [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page) - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.) From 422d52858cbb2f6ed9151b46cb35479179e5bcc3 Mon Sep 17 00:00:00 2001 From: "Edwin.JH.Lee" Date: Fri, 22 Nov 2024 08:55:25 +0800 Subject: [PATCH 081/106] readme: add x-cmd ollama module to community integrations (#5191) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0eaae44f..dea0250d 100644 --- a/README.md +++ b/README.md @@ -386,6 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor +- [x-cmd ollama](https://x-cmd.com/mod/ollama) - [bb7](https://github.com/drunkwcodes/bb7) - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage) - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. 
From 84b3e07f1be8f7ed6c76d15d75efc358b382b2af Mon Sep 17 00:00:00 2001 From: Dustin Date: Thu, 21 Nov 2024 20:49:30 -0500 Subject: [PATCH 082/106] readme: add ollamarama-matrix to community integrations (#7325) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index dea0250d..573f7e87 100644 --- a/README.md +++ b/README.md @@ -344,6 +344,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page) - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.) - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) +- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol) - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) From 597072ef1b4a569150a90ea7f1924d90a8649db2 Mon Sep 17 00:00:00 2001 From: Mikel Olasagasti Uranga Date: Fri, 22 Nov 2024 04:37:04 +0100 Subject: [PATCH 083/106] readme: update google/uuid module (#7310) update uuid.New().String() to uuid.NewString() --- app/store/store.go | 2 +- go.mod | 2 +- go.sum | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/app/store/store.go b/app/store/store.go index b743e8a8..370436c5 100644 --- a/app/store/store.go +++ b/app/store/store.go @@ -64,7 +64,7 @@ func initStore() { slog.Debug(fmt.Sprintf("unexpected error searching for store: %s", err)) } slog.Debug("initializing new store") - store.ID = uuid.New().String() + store.ID = uuid.NewString() writeStore(getStorePath()) } diff --git a/go.mod b/go.mod index 8102c6bc..7eb6a535 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/emirpasic/gods v1.18.1 github.com/gin-gonic/gin v1.10.0 github.com/golang/protobuf v1.5.4 // indirect - github.com/google/uuid v1.1.2 + github.com/google/uuid v1.6.0 github.com/olekukonko/tablewriter v0.0.5 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.9.0 diff --git a/go.sum b/go.sum index e98adeaa..75b0c054 100644 --- a/go.sum +++ b/go.sum @@ -113,8 +113,9 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= From 25c9339e2d9cea413f2cb1b616e84dab89b81e59 Mon Sep 17 00:00:00 2001 From: Leon Sander <72946124+Leon-Sander@users.noreply.github.com> Date: Fri, 22 Nov 2024 
05:39:38 +0100 Subject: [PATCH 084/106] readme: add Local Multimodal AI Chat app to community integrations (#6931) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 573f7e87..ce132b25 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama) - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) +- [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.) - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux) - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.) From d88972ea48cfec20ebba6e0a86a825fca3ecb193 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 22 Nov 2024 08:04:54 -0800 Subject: [PATCH 085/106] Be quiet when redirecting output (#7360) This avoids emitting the progress indicators to stderr, and the interactive prompts to the output file or pipe. Running "ollama run model > out.txt" now exits immediately, and "echo hello | ollama run model > out.txt" produces zero stderr output and a typical response in out.txt --- cmd/cmd.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index 91819c8e..97e821c7 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -456,6 +456,10 @@ func RunHandler(cmd *cobra.Command, args []string) error { if len(prompts) > 0 { interactive = false } + // Be quiet if we're redirecting to a pipe or file + if !term.IsTerminal(int(os.Stdout.Fd())) { + interactive = false + } nowrap, err := cmd.Flags().GetBool("nowordwrap") if err != nil { From b85520bfb9c9dd92056ecded55bf8c4cfd28088f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 22 Nov 2024 08:05:32 -0800 Subject: [PATCH 086/106] logs: explain client aborts better (#7783) Users get confused by "Failed to acquire semaphore" error="context canceled" messages in the logs, which are actually clients giving up. While there could be a legitimate hang bug in the system, sometimes this is just short client timeouts with an overloaded system, so this should help users understand what's going on better. 
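As a minimal sketch of the logging pattern this change applies (illustrative only, not the exact runner code; it assumes a *semaphore.Weighted guard from golang.org/x/sync/semaphore plus the standard context, errors and log/slog packages, and the helper name is made up):

    // acquireSlot is an illustrative helper, not code from this patch.
    func acquireSlot(ctx context.Context, sem *semaphore.Weighted) error {
        if err := sem.Acquire(ctx, 1); err != nil {
            if errors.Is(err, context.Canceled) {
                // The client hung up or gave up waiting; not a server-side failure.
                slog.Info("aborting request due to client closing the connection")
            } else {
                slog.Error("Failed to acquire semaphore", "error", err)
            }
            return err
        }
        return nil
    }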
--- llama/runner/runner.go | 12 ++++++++++-- llm/server.go | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index c7662b33..c3d0353f 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -651,7 +651,11 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { // Ensure that a place to put the sequence is available if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { - slog.Error("Failed to acquire semaphore", "error", err) + if errors.Is(err, context.Canceled) { + slog.Info("aborting completion request due to client closing the connection") + } else { + slog.Error("Failed to acquire semaphore", "error", err) + } return } defer s.seqsSem.Release(1) @@ -740,7 +744,11 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { // Ensure that a place to put the sequence is available if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { - slog.Error("Failed to acquire semaphore", "error", err) + if errors.Is(err, context.Canceled) { + slog.Info("aborting embeddings request due to client closing the connection") + } else { + slog.Error("Failed to acquire semaphore", "error", err) + } return } defer s.seqsSem.Release(1) diff --git a/llm/server.go b/llm/server.go index d7c5198d..b2405905 100644 --- a/llm/server.go +++ b/llm/server.go @@ -687,7 +687,11 @@ type CompletionResponse struct { func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { if err := s.sem.Acquire(ctx, 1); err != nil { - slog.Error("Failed to acquire semaphore", "error", err) + if errors.Is(err, context.Canceled) { + slog.Info("aborting completion request due to client closing the connection") + } else { + slog.Error("Failed to acquire semaphore", "error", err) + } return err } defer s.sem.Release(1) @@ -865,7 +869,11 @@ type EmbeddingResponse struct { func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { if err := s.sem.Acquire(ctx, 1); err != nil { - slog.Error("Failed to acquire semaphore", "error", err) + if errors.Is(err, context.Canceled) { + slog.Info("aborting embedding request due to client closing the connection") + } else { + slog.Error("Failed to acquire semaphore", "error", err) + } return nil, err } defer s.sem.Release(1) From f0a351810c496d6ead14b3d3a9d4d536c4ae772a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 22 Nov 2024 08:05:45 -0800 Subject: [PATCH 087/106] tests: fix max queue integration test (#7782) This had fallen out of sync with the envconfig behavior, where max queue default was not zero. 
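For illustration, the approach boils down to pinning the limit inside the test instead of reading the envconfig default (a hypothetical, trimmed-down test using the standard testing and strconv packages; the name and count are not from this patch):

    func TestQueueLimitPinned(t *testing.T) {
        threadCount := 16
        t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
        // ...fire more than threadCount concurrent requests and expect the
        // overflow to be rejected with a "busy" error while the rest succeed.
    }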
--- integration/max_queue_test.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/integration/max_queue_test.go b/integration/max_queue_test.go index ec9e085a..a2766430 100644 --- a/integration/max_queue_test.go +++ b/integration/max_queue_test.go @@ -16,7 +16,6 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" ) func TestMaxQueue(t *testing.T) { @@ -27,12 +26,8 @@ func TestMaxQueue(t *testing.T) { // Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU // Also note that by default Darwin can't sustain > ~128 connections without adjusting limits - threadCount := 32 - if maxQueue := envconfig.MaxQueue(); maxQueue != 0 { - threadCount = int(maxQueue) - } else { - t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) - } + threadCount := 16 + t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount)) req := api.GenerateRequest{ Model: "orca-mini", From 7b5585b9cbc5f803583ebd6a9627c563521c8970 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Fri, 22 Nov 2024 11:57:35 -0800 Subject: [PATCH 088/106] server: remove out of date anonymous access check (#7785) In the past the ollama.com server would return a JWT that contained information about the user being authenticated. This was used to return different error messages to the user. This is no longer possible since the token used to authenticate does not contain information about the user anymore. Removing this code that no longer works. Follow up changes will improve the error messages returned here, but good to clean up first. --- cmd/cmd.go | 53 ------------------------------------------------ server/images.go | 44 ---------------------------------------- 2 files changed, 97 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 97e821c7..fad06ffd 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -19,7 +19,6 @@ import ( "os" "os/signal" "path/filepath" - "regexp" "runtime" "strconv" "strings" @@ -35,14 +34,11 @@ import ( "golang.org/x/term" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/auth" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/server" - "github.com/ollama/ollama/types/errtypes" - "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" ) @@ -516,47 +512,6 @@ func RunHandler(cmd *cobra.Command, args []string) error { return generate(cmd, opts) } -func errFromUnknownKey(unknownKeyErr error) error { - // find SSH public key in the error message - sshKeyPattern := `ssh-\w+ [^\s"]+` - re := regexp.MustCompile(sshKeyPattern) - matches := re.FindStringSubmatch(unknownKeyErr.Error()) - - if len(matches) > 0 { - serverPubKey := matches[0] - - localPubKey, err := auth.GetPublicKey() - if err != nil { - return unknownKeyErr - } - - if runtime.GOOS == "linux" && serverPubKey != localPubKey { - // try the ollama service public key - svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub") - if err != nil { - return unknownKeyErr - } - localPubKey = strings.TrimSpace(string(svcPubKey)) - } - - // check if the returned public key matches the local public key, this prevents adding a remote key to the user's account - if serverPubKey != localPubKey { - return unknownKeyErr - } - - var msg strings.Builder - msg.WriteString(unknownKeyErr.Error()) - msg.WriteString("\n\nYour ollama key is:\n") - msg.WriteString(localPubKey) - 
msg.WriteString("\nAdd your key at:\n") - msg.WriteString("https://ollama.com/settings/keys") - - return errors.New(msg.String()) - } - - return unknownKeyErr -} - func PushHandler(cmd *cobra.Command, args []string) error { client, err := api.ClientFromEnvironment() if err != nil { @@ -610,14 +565,6 @@ func PushHandler(cmd *cobra.Command, args []string) error { if strings.Contains(err.Error(), "access denied") { return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own") } - host := model.ParseName(args[0]).Host - isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com") - if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost { - // the user has not added their ollama key to ollama.com - // re-throw an error with a more user-friendly message - return errFromUnknownKey(err) - } - return err } diff --git a/server/images.go b/server/images.go index 6a0e8ae3..1f6a9712 100644 --- a/server/images.go +++ b/server/images.go @@ -5,7 +5,6 @@ import ( "cmp" "context" "crypto/sha256" - "encoding/base64" "encoding/hex" "encoding/json" "errors" @@ -24,14 +23,12 @@ import ( "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/auth" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/llama" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" - "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" ) @@ -985,37 +982,7 @@ func GetSHA256Digest(r io.Reader) (string, int64) { var errUnauthorized = errors.New("unauthorized: access denied") -// getTokenSubject returns the subject of a JWT token, it does not validate the token -func getTokenSubject(token string) string { - parts := strings.Split(token, ".") - if len(parts) != 3 { - return "" - } - - payload := parts[1] - payloadBytes, err := base64.RawURLEncoding.DecodeString(payload) - if err != nil { - slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err)) - return "" - } - - var payloadMap map[string]interface{} - if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil { - slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err)) - return "" - } - - sub, ok := payloadMap["sub"] - if !ok { - slog.Error("jwt does not contain 'sub' field") - return "" - } - - return fmt.Sprintf("%s", sub) -} - func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) { - anonymous := true // access will default to anonymous if no user is found associated with the public key for range 2 { resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts) if err != nil { @@ -1036,7 +1003,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR if err != nil { return nil, err } - anonymous = getTokenSubject(token) == "anonymous" regOpts.Token = token if body != nil { _, err = body.Seek(0, io.SeekStart) @@ -1059,16 +1025,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR } } - if anonymous { - // no user is associated with the public key, and the request requires non-anonymous access - pubKey, nestedErr := auth.GetPublicKey() - if nestedErr != nil { - slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr)) - return nil, errUnauthorized - } - return nil, 
&errtypes.UnknownOllamaKey{Key: pubKey} - } - // user is associated with the public key, but is not authorized to make the request return nil, errUnauthorized } From 3478b2cf14c3fa2661c03f7fd5764a63a496293a Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 22 Nov 2024 15:17:15 -0800 Subject: [PATCH 089/106] runner.go: Fix deadlock with many concurrent requests If there are no avilable slots for new sequences then a request will not be added to the processing queue but will continue on to wait for a response that never comes. Besides never giving a response to the request, this prevents the model from being unloaded due to the outstanding request. To prevent this, there are semaphores that prevent more requests from being processed than there are slots - one in the Ollama server and one in the runner. - The Ollama server one works but it is not designed to protect the runner's data internal structures and the runner can return a final response before clearing its data structures. - The internal runner semaphore has similar behavior where it can release the semaphore when it issues a response. This is wrong - it should only release the semaphore after it has cleared the data structure. In addition, we should return an error if a slot is not found rather than deadlocking in the event we ever get to this spot. Fixes #7779 --- llama/runner/runner.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index c3d0353f..db8092f3 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -300,6 +300,7 @@ func (s *Server) removeSequence(seqIndex int, reason string) { close(seq.embedding) seq.cache.InUse = false s.seqs[seqIndex] = nil + s.seqsSem.Release(1) } func (s *Server) run(ctx context.Context) { @@ -649,7 +650,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - // Ensure that a place to put the sequence is available + // Ensure there is a place to put the sequence, released when removed from s.seqs if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { if errors.Is(err, context.Canceled) { slog.Info("aborting completion request due to client closing the connection") @@ -658,9 +659,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { } return } - defer s.seqsSem.Release(1) s.mu.Lock() + found := false for i, sq := range s.seqs { if sq == nil { seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) @@ -674,11 +675,17 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { s.seqs[i] = seq s.cond.Signal() + found = true break } } s.mu.Unlock() + if !found { + http.Error(w, "could not find an available sequence", http.StatusInternalServerError) + return + } + for { select { case <-r.Context().Done(): @@ -742,7 +749,7 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { return } - // Ensure that a place to put the sequence is available + // Ensure there is a place to put the sequence, released when removed from s.seqs if err := s.seqsSem.Acquire(r.Context(), 1); err != nil { if errors.Is(err, context.Canceled) { slog.Info("aborting embeddings request due to client closing the connection") @@ -751,9 +758,9 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { } return } - defer s.seqsSem.Release(1) s.mu.Lock() + found := false for i, sq := range s.seqs { if sq == nil { seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt) @@ -764,11 +771,17 
@@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { } s.seqs[i] = seq s.cond.Signal() + found = true break } } s.mu.Unlock() + if !found { + http.Error(w, "could not find an available sequence", http.StatusInternalServerError) + return + } + embedding := <-seq.embedding if err := json.NewEncoder(w).Encode(&EmbeddingResponse{ From 78f779a3230dca950aca5701bad365322561540b Mon Sep 17 00:00:00 2001 From: Rodrigo Ribeiro Gomes Date: Sat, 23 Nov 2024 15:08:59 -0300 Subject: [PATCH 090/106] readme: add powershai, a powershell module with ollama support to community integrations (#7438) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ce132b25..41246b8e 100644 --- a/README.md +++ b/README.md @@ -392,6 +392,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [bb7](https://github.com/drunkwcodes/bb7) - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage) - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more. +- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. ### Apple Vision Pro From 31cb1ca9e567fac7f940d041d9c5f45032dd8972 Mon Sep 17 00:00:00 2001 From: oza6ut0ne <33759728+oza6ut0ne@users.noreply.github.com> Date: Sun, 24 Nov 2024 05:39:05 +0900 Subject: [PATCH 091/106] openai: accept X-Stainless-Retry-Count header (#6910) --- server/routes.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/routes.go b/server/routes.go index 5dfd6ffe..c13cd023 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1141,7 +1141,7 @@ func (s *Server) GenerateRoutes() http.Handler { config.AllowWildcard = true config.AllowBrowserExtensions = true config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"} - openAIProperties := []string{"lang", "package-version", "os", "arch", "runtime", "runtime-version", "async"} + openAIProperties := []string{"lang", "package-version", "os", "arch", "retry-count", "runtime", "runtime-version", "async"} for _, prop := range openAIProperties { config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop) } From bb52abfa559a4734f6fab4bc1b86ff8da66f19c1 Mon Sep 17 00:00:00 2001 From: josc146 Date: Sun, 24 Nov 2024 05:31:27 +0800 Subject: [PATCH 092/106] readme: add ChatGPTBox and RWKV-Runner to community integrations (#4118) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 41246b8e..8038b1e7 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG) - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation) - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends) +- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama) - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models) - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama) - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) @@ -497,6 +498,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) +- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467) - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) - [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.) 
From 2ebdb54fb3012142a56ad9bee58298d0892f4c1a Mon Sep 17 00:00:00 2001 From: Meng Zhuo Date: Sun, 24 Nov 2024 07:21:54 +0800 Subject: [PATCH 093/106] all: update math32 go mod to v1.11.0 (#6627) --- go.mod | 2 +- go.sum | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 7eb6a535..496d8d3a 100644 --- a/go.mod +++ b/go.mod @@ -29,7 +29,7 @@ require ( github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect github.com/bytedance/sonic/loader v0.1.1 // indirect github.com/chewxy/hm v1.0.0 // indirect - github.com/chewxy/math32 v1.10.1 // indirect + github.com/chewxy/math32 v1.11.0 // indirect github.com/cloudwego/base64x v0.1.4 // indirect github.com/cloudwego/iasm v0.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/go.sum b/go.sum index 75b0c054..b3093ceb 100644 --- a/go.sum +++ b/go.sum @@ -21,8 +21,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= -github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU= -github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= +github.com/chewxy/math32 v1.11.0 h1:8sek2JWqeaKkVnHa7bPVqCEOUPbARo4SGxs6toKyAOo= +github.com/chewxy/math32 v1.11.0/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y= github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= @@ -231,8 +231,6 @@ golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+o golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4= -golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE= golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g= golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -268,8 +266,6 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= -golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -296,8 +292,6 
@@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From a820d2b2673f7f8035e3a2a6f93c83af465f841c Mon Sep 17 00:00:00 2001 From: Patcher Date: Sat, 23 Nov 2024 21:03:12 -0500 Subject: [PATCH 094/106] readme: add observability section with OpenLIT to community-integrations --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 8038b1e7..5fc70510 100644 --- a/README.md +++ b/README.md @@ -514,3 +514,7 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Supported backends - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. + +### Observability + +- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics. From 3440ffb37b0a02251f83d532639c629aaab3fc75 Mon Sep 17 00:00:00 2001 From: Adarsh Mishra <95633830+adarshM84@users.noreply.github.com> Date: Mon, 25 Nov 2024 00:02:23 +0530 Subject: [PATCH 095/106] readme: add description for OpenTalkGpt in community integrations (#7818) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5fc70510..dde3eec7 100644 --- a/README.md +++ b/README.md @@ -351,7 +351,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) -- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) +- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI) - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. 
Supports local models via Ollama) - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama) - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux) From fda1e6b563a4ac5d3fd40f2fe393911c5b79141e Mon Sep 17 00:00:00 2001 From: frob Date: Sun, 24 Nov 2024 19:33:33 +0100 Subject: [PATCH 096/106] llm: bring fileTypes into alignment with llama.cpp (#7819) --- llm/filetype.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llm/filetype.go b/llm/filetype.go index 7a8e9f69..10f3d670 100644 --- a/llm/filetype.go +++ b/llm/filetype.go @@ -32,9 +32,10 @@ const ( fileTypeIQ1_S fileTypeIQ4_NL fileTypeIQ3_S + fileTypeIQ3_M fileTypeIQ2_S - fileTypeIQ4_XS fileTypeIQ2_M + fileTypeIQ4_XS fileTypeIQ1_M fileTypeBF16 @@ -93,6 +94,8 @@ func ParseFileType(s string) (fileType, error) { return fileTypeIQ4_NL, nil case "IQ3_S": return fileTypeIQ3_S, nil + case "IQ3_M": + return fileTypeIQ3_M, nil case "IQ2_S": return fileTypeIQ2_S, nil case "IQ4_XS": @@ -160,6 +163,8 @@ func (t fileType) String() string { return "IQ4_NL" case fileTypeIQ3_S: return "IQ3_S" + case fileTypeIQ3_M: + return "IQ3_M" case fileTypeIQ2_S: return "IQ2_S" case fileTypeIQ4_XS: From 3987acd7ec2825d359fa148662d4d88afe4a2476 Mon Sep 17 00:00:00 2001 From: reid41 Date: Mon, 25 Nov 2024 07:55:09 +0800 Subject: [PATCH 097/106] readme: add descriptions for QA-Pilot and shell-pilot community integrations (#4303) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dde3eec7..186fedb6 100644 --- a/README.md +++ b/README.md @@ -298,7 +298,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm) - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat) - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats) -- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository) +- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories) - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases) - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG) - [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding) @@ -377,7 +377,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Oatmeal](https://github.com/dustinblackman/oatmeal) - [cmdh](https://github.com/pgibler/cmdh) - [ooo](https://github.com/npahlfer/ooo) -- [shell-pilot](https://github.com/reid41/shell-pilot) +- [shell-pilot](https://github.com/reid41/shell-pilot)(Interact with models via pure shell scripts on Linux or macOS) - [tenere](https://github.com/pythops/tenere) - [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/). 
- [typechat-cli](https://github.com/anaisbetts/typechat-cli) From cfb1ddd6fc2a5ca755f02add26239781c77c2199 Mon Sep 17 00:00:00 2001 From: Simon Schampijer <3714785+erikos@users.noreply.github.com> Date: Mon, 25 Nov 2024 01:06:22 +0100 Subject: [PATCH 098/106] examples: update langchain-python-simple (#3591) - better formatting of input prompt - use invoke instead of predict --- examples/langchain-python-simple/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py index 8d6989c8..dafff827 100644 --- a/examples/langchain-python-simple/main.py +++ b/examples/langchain-python-simple/main.py @@ -1,6 +1,6 @@ from langchain.llms import Ollama -input = input("What is your question?") +input = input("What is your question?\n> ") llm = Ollama(model="llama3.2") -res = llm.predict(input) +res = llm.invoke(input) print (res) From a210ec74d29ee718bca9b3c192e0a93cf86cbf21 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Mon, 25 Nov 2024 09:40:16 -0800 Subject: [PATCH 099/106] cmd: print location of model after pushing (#7695) After a user pushes their model it is not clear what to do next. Add a link to the output of `ollama push` that tells the user where their model can now be found. --- cmd/cmd.go | 12 +++++ cmd/cmd_test.go | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fad06ffd..01eb66f9 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -39,6 +39,7 @@ import ( "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" "github.com/ollama/ollama/server" + "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" ) @@ -558,6 +559,8 @@ func PushHandler(cmd *cobra.Command, args []string) error { } request := api.PushRequest{Name: args[0], Insecure: insecure} + + n := model.ParseName(args[0]) if err := client.Push(cmd.Context(), &request, fn); err != nil { if spinner != nil { spinner.Stop() @@ -568,7 +571,16 @@ func PushHandler(cmd *cobra.Command, args []string) error { return err } + p.Stop() spinner.Stop() + + destination := n.String() + if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") { + destination = "https://ollama.com/" + strings.TrimSuffix(n.DisplayShortest(), ":latest") + } + fmt.Printf("\nYou can find your model at:\n\n") + fmt.Printf("\t%s\n", destination) + return nil } diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go index fd8289cf..2e6428cf 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "io" "net/http" "net/http/httptest" "os" @@ -369,3 +370,127 @@ func TestGetModelfileName(t *testing.T) { }) } } + +func TestPushHandler(t *testing.T) { + tests := []struct { + name string + modelName string + serverResponse map[string]func(w http.ResponseWriter, r *http.Request) + expectedError string + expectedOutput string + }{ + { + name: "successful push", + modelName: "test-model", + serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){ + "/api/push": func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST request, got %s", r.Method) + } + + var req api.PushRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + if req.Name != "test-model" { + t.Errorf("expected model name 'test-model', got %s", req.Name) + } + + // Simulate progress 
updates + responses := []api.ProgressResponse{ + {Status: "preparing manifest"}, + {Digest: "sha256:abc123456789", Total: 100, Completed: 50}, + {Digest: "sha256:abc123456789", Total: 100, Completed: 100}, + } + + for _, resp := range responses { + if err := json.NewEncoder(w).Encode(resp); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + w.(http.Flusher).Flush() + } + }, + }, + expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n", + }, + { + name: "unauthorized push", + modelName: "unauthorized-model", + serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){ + "/api/push": func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusUnauthorized) + err := json.NewEncoder(w).Encode(map[string]string{ + "error": "access denied", + }) + if err != nil { + t.Fatal(err) + } + }, + }, + expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if handler, ok := tt.serverResponse[r.URL.Path]; ok { + handler(w, r) + return + } + http.Error(w, "not found", http.StatusNotFound) + })) + defer mockServer.Close() + + t.Setenv("OLLAMA_HOST", mockServer.URL) + + cmd := &cobra.Command{} + cmd.Flags().Bool("insecure", false, "") + cmd.SetContext(context.TODO()) + + // Redirect stderr to capture progress output + oldStderr := os.Stderr + r, w, _ := os.Pipe() + os.Stderr = w + + // Capture stdout for the "Model pushed" message + oldStdout := os.Stdout + outR, outW, _ := os.Pipe() + os.Stdout = outW + + err := PushHandler(cmd, []string{tt.modelName}) + + // Restore stderr + w.Close() + os.Stderr = oldStderr + // drain the pipe + if _, err := io.ReadAll(r); err != nil { + t.Fatal(err) + } + + // Restore stdout and get output + outW.Close() + os.Stdout = oldStdout + stdout, _ := io.ReadAll(outR) + + if tt.expectedError == "" { + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if tt.expectedOutput != "" { + if got := string(stdout); got != tt.expectedOutput { + t.Errorf("expected output %q, got %q", tt.expectedOutput, got) + } + } + } else { + if err == nil || !strings.Contains(err.Error(), tt.expectedError) { + t.Errorf("expected error containing %q, got %v", tt.expectedError, err) + } + } + }) + } +} From 647513a7d48920f897f536fe9df45c6ca38fe83e Mon Sep 17 00:00:00 2001 From: Shikhar Bakhda Date: Mon, 25 Nov 2024 09:55:33 -0800 Subject: [PATCH 100/106] readme: add HoneyHive to community integrations (#7831) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 186fedb6..1e1d02bc 100644 --- a/README.md +++ b/README.md @@ -518,3 +518,4 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Observability - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics. +- [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production. 
From 2b7ed61ca22743598db2b407a94b8865042f1078 Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Mon, 25 Nov 2024 15:08:34 -0800 Subject: [PATCH 101/106] server: fix Transport override (#7834) This changes makeRequest to update the http client Transport if and only if testMakeRequestDialContext is set. This is to avoid overriding the default Transport when testMakeRequestDialContext is nil, which broke existing behavior, included proxies, timeouts, and other behaviors. Fixes #7829 Fixes #7788 --- server/images.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/server/images.go b/server/images.go index 1f6a9712..29877db3 100644 --- a/server/images.go +++ b/server/images.go @@ -1076,17 +1076,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header req.ContentLength = contentLength } - resp, err := (&http.Client{ - Transport: &http.Transport{ - DialContext: testMakeRequestDialContext, - }, + c := &http.Client{ CheckRedirect: regOpts.CheckRedirect, - }).Do(req) - if err != nil { - return nil, err } - - return resp, nil + if testMakeRequestDialContext != nil { + tr := http.DefaultTransport.(*http.Transport).Clone() + tr.DialContext = testMakeRequestDialContext + c.Transport = tr + } + return c.Do(req) } func getValue(header, key string) string { From 30e88d7f31cd3af582346b995a8bb10b3ff37125 Mon Sep 17 00:00:00 2001 From: frob Date: Tue, 26 Nov 2024 01:43:29 +0100 Subject: [PATCH 102/106] cmd: don't submit svg files as images for now (#7830) --- cmd/interactive.go | 2 +- cmd/interactive_test.go | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index b495a109..9035b4c5 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -514,7 +514,7 @@ func extractFileNames(input string) []string { // Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20) // and followed by more characters and a file extension // This will capture non filename strings, but we'll check for file existence to remove mismatches - regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b` + regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b` re := regexp.MustCompile(regexPattern) return re.FindAllString(input, -1) diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index bb7e0aba..118f4264 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -12,44 +12,45 @@ import ( func TestExtractFilenames(t *testing.T) { // Unix style paths input := ` some preamble - ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 -/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg` + ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg +/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG` res := extractFileNames(input) assert.Len(t, res, 5) assert.Contains(t, res[0], "one.png") assert.Contains(t, res[1], "two.jpg") assert.Contains(t, res[2], "three.jpeg") assert.Contains(t, res[3], "four.png") - assert.Contains(t, res[4], "five.svg") + assert.Contains(t, res[4], "five.JPG") assert.NotContains(t, res[4], '"') - assert.NotContains(t, res, "inbtween") + assert.NotContains(t, res, "inbetween1") + assert.NotContains(t, res, "./1.svg") // Windows style paths input = ` some preamble c:/users/jdoe/one.png inbetween1 c:/program 
files/someplace/two.jpg inbetween2 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4 -./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6 -d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 - d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending +./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6 +d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 + d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending ` res = extractFileNames(input) assert.Len(t, res, 10) - assert.NotContains(t, res, "inbtween") + assert.NotContains(t, res, "inbetween2") assert.Contains(t, res[0], "one.png") assert.Contains(t, res[0], "c:") assert.Contains(t, res[1], "two.jpg") assert.Contains(t, res[1], "c:") assert.Contains(t, res[2], "three.jpeg") assert.Contains(t, res[3], "four.png") - assert.Contains(t, res[4], "five.svg") + assert.Contains(t, res[4], "five.JPG") assert.Contains(t, res[5], "six.png") - assert.Contains(t, res[6], "seven.svg") + assert.Contains(t, res[6], "seven.JPEG") assert.Contains(t, res[6], "d:") assert.Contains(t, res[7], "eight.png") assert.Contains(t, res[7], "c:") assert.Contains(t, res[8], "nine.png") assert.Contains(t, res[8], "d:") - assert.Contains(t, res[9], "ten.svg") + assert.Contains(t, res[9], "ten.PNG") assert.Contains(t, res[9], "E:") } From 52bbad12f96e84f7d62c5dfdd7dbba2b10b37344 Mon Sep 17 00:00:00 2001 From: jake83741 <125723241+jake83741@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:56:30 -0500 Subject: [PATCH 103/106] readme: update description for vnc-lm community integration (#7832) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1e1d02bc..52f6fa55 100644 --- a/README.md +++ b/README.md @@ -504,7 +504,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.) - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama) - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) -- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) +- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator) From 2cd11ae365a9423578069457312dce6b9e1e5a37 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 25 Nov 2024 14:49:38 -0800 Subject: [PATCH 104/106] runner.go: Add unit tests for context shifting This also makes it easier to truncate long inputs the same as shifting but does not actually implement it. This type of truncation has a trade off between quality and time to first token. 
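As a rough sketch of the arithmetic the new ShiftDiscard helper and its table test pin down (written as a standalone function for illustration; the real method hangs off InputCache):

    func shiftDiscard(numCtx, inputLen, numKeep int) int {
        // Aim to free half of the shiftable window, i.e. everything past the
        // numKeep prefix, but always at least one slot.
        targetFree := max((numCtx-numKeep)/2, 1)
        currentFree := numCtx - inputLen
        return max(targetFree-currentFree, 0)
    }

For example, shiftDiscard(2048, 2048, 5) gives targetFree = (2048-5)/2 = 1021 and currentFree = 0, so 1021 inputs are discarded, matching the "Shift" case in the new cache_test.go table.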
--- llama/runner/cache.go | 20 +++++++++--- llama/runner/cache_test.go | 63 ++++++++++++++++++++++++++++++++++++++ llama/runner/runner.go | 6 ++-- 3 files changed, 82 insertions(+), 7 deletions(-) diff --git a/llama/runner/cache.go b/llama/runner/cache.go index b487fe25..0f5f0a09 100644 --- a/llama/runner/cache.go +++ b/llama/runner/cache.go @@ -199,6 +199,20 @@ func countCommonPrefix(a []input, b []input) int { return count } +func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int { + targetFree := (c.numCtx - numKeep) / 2 + targetFree = max(targetFree, 1) + + currentFree := c.numCtx - inputLen + discard := targetFree - currentFree + + if discard < 0 { + discard = 0 + } + + return discard +} + // Frees up space in the KV cache by deleting the oldest half of history and shifting // the newest half into that space (saving numKeep inputs at the beginning). // @@ -208,11 +222,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error { return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx) } - targetFree := (c.numCtx - numKeep) / 2 - targetFree = max(targetFree, 1) - - currentFree := c.numCtx - len(slot.Inputs) - discard := targetFree - currentFree + discard := c.ShiftDiscard(len(slot.Inputs), numKeep) if discard <= 0 { return nil diff --git a/llama/runner/cache_test.go b/llama/runner/cache_test.go index 0e38c67d..79cd93cb 100644 --- a/llama/runner/cache_test.go +++ b/llama/runner/cache_test.go @@ -227,3 +227,66 @@ func TestFindCacheSlot(t *testing.T) { }) } } + +func TestShiftDiscard(t *testing.T) { + tests := []struct { + name string + numCtx int + numKeep int + inputLen int + expected int + }{ + { + name: "Shift", + numCtx: 2048, + numKeep: 5, + inputLen: 2048, + expected: 1021, + }, + { + name: "Max Keep", + numCtx: 2048, + numKeep: 2047, + inputLen: 2048, + expected: 1, + }, + { + name: "No Keep", + numCtx: 2048, + numKeep: 0, + inputLen: 2048, + expected: 1024, + }, + { + name: "Truncate", + numCtx: 2048, + numKeep: 5, + inputLen: 5000, + expected: 3973, + }, + { + name: "Truncate Keep", + numCtx: 2048, + numKeep: 2047, + inputLen: 5000, + expected: 2953, + }, + { + name: "No Op", + numCtx: 2048, + numKeep: 5, + inputLen: 512, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := InputCache{numCtx: tt.numCtx} + result := c.ShiftDiscard(tt.inputLen, tt.numKeep) + if result != tt.expected { + t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected) + } + }) + } +} diff --git a/llama/runner/runner.go b/llama/runner/runner.go index db8092f3..8762b3da 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -122,9 +122,11 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen params.numKeep = min(params.numKeep, s.cache.numCtx-1) if len(inputs) > s.cache.numCtx { - slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep) + discard := len(inputs) - s.cache.numCtx newInputs := inputs[:params.numKeep] - newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...) + newInputs = append(newInputs, inputs[params.numKeep+discard:]...) 
+ + slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs)) inputs = newInputs } From 71e6a0d0d181e3be45f3e47a677d088479d73c76 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Nov 2024 15:08:24 -0800 Subject: [PATCH 105/106] runner.go: Don't try to extract image tags for text models When processing a prompt, we look for image tags of the form [img-0], which are inserted by the Ollama server process. However, this can cause errors if the original prompt has these tags - typically an image not found error is returned. This changes tag searching behavior to be similar to the 0.3.x series, which will largely avoid these problems. However,they can still happen when input text with these tags is used with image models. The correct solution is to escape the tags but this is a larger issue with special sequences in general so this is an incremental fix that should avoid the problem for the majority of cases. --- llama/runner/runner.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 8762b3da..0255ed55 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -164,10 +164,16 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen // generating image embeddings for each image func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) { var inputs []input + var parts []string + var matches [][]string - re := regexp.MustCompile(`\[img-(\d+)\]`) - parts := re.Split(prompt, -1) - matches := re.FindAllStringSubmatch(prompt, -1) + if s.image != nil { + re := regexp.MustCompile(`\[img-(\d+)\]`) + parts = re.Split(prompt, -1) + matches = re.FindAllStringSubmatch(prompt, -1) + } else { + parts = []string{prompt} + } for i, part := range parts { // text - tokenize From 940e62772e68c99cd4cb0b037acf5c16c23e0854 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 26 Nov 2024 16:08:09 -0800 Subject: [PATCH 106/106] openai: remove unused error code (#7850) The writeError takes a code argument which is no longer used. Remove it for clarity. 
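In short, the signature change is (shown here for orientation, mirroring the diff below):

    // Before (the status code parameter was never used):
    //   func (w *BaseWriter) writeError(code int, data []byte) (int, error)
    // After:
    //   func (w *BaseWriter) writeError(data []byte) (int, error)
    // Call sites simply drop the first argument: w.writeError(data).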
--- openai/openai.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 2bf9b9f9..10e5b09e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -571,7 +571,7 @@ type EmbedWriter struct { model string } -func (w *BaseWriter) writeError(code int, data []byte) (int, error) { +func (w *BaseWriter) writeError(data []byte) (int, error) { var serr api.StatusError err := json.Unmarshal(data, &serr) if err != nil { @@ -630,7 +630,7 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) { func (w *ChatWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -679,7 +679,7 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) { func (w *CompleteWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -704,7 +704,7 @@ func (w *ListWriter) writeResponse(data []byte) (int, error) { func (w *ListWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -730,7 +730,7 @@ func (w *RetrieveWriter) writeResponse(data []byte) (int, error) { func (w *RetrieveWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data) @@ -755,7 +755,7 @@ func (w *EmbedWriter) writeResponse(data []byte) (int, error) { func (w *EmbedWriter) Write(data []byte) (int, error) { code := w.ResponseWriter.Status() if code != http.StatusOK { - return w.writeError(code, data) + return w.writeError(data) } return w.writeResponse(data)