Merge https://github.com/ollama/ollama

Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
2024-11-30 23:59:00 +05:30 · 2024-11-30 23:58:51 +05:30 · 2024-11-29 20:00:09 -08:00 · 2024-11-28 17:27:11 -08:00 · 2024-11-28 15:16:27 -08:00 · 2024-11-27 13:40:57 -08:00
54 changed files with 1413 additions and 726 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -281,7 +281,7 @@ jobs:
        shell: bash
      - uses: golangci/golangci-lint-action@v6
        with:
-          args: --timeout 8m0s -v
+          args: --timeout 10m0s -v
  test:
    strategy:
      matrix:
--- a/README.md
+++ b/README.md
@ -47,26 +47,28 @@ Ollama supports a list of models available on [ollama.com/library](https://ollam
 Here are some example models that can be downloaded:
-| Model              | Parameters | Size  | Download                       |
+| Model              | Parameters | Size  | Download                         |
-| ------------------ | ---------- | ----- | ------------------------------ |
+| ------------------ | ---------- | ----- | -------------------------------- |
-| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`          |
+| Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
-| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`       |
+| Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
-| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
+| Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
-| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
+| Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
-| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
-| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
-| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
-| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`         |
+| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
-| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
+| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |
-| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`        |
+| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
-| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
-| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`         |
+| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
-| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
-| Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
+| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
-| Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
+| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
-| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
+| Starling           | 7B         | 4.1GB | `ollama run starling-lm`         |
-| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
+| Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
-| Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
+| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
 | Solar              | 10.7B      | 6.1GB | `ollama run solar`               |
 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@ -296,7 +298,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
+- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
 - [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
@ -306,11 +308,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
 - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
 - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
 - [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
 - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
 - [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
 - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
 - [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
 - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
@ -318,6 +326,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
 - [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
 - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
 - [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
@ -327,12 +337,34 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [Local Multimodal AI Chat](https://github.com/Leon-Sander/Local-Multimodal-AI-Chat) (Ollama-based LLM Chat with support for multiple features, including PDF RAG, voice chat, image-based interactions, and integration with OpenAI.)
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [OrionChat](https://github.com/EliasPereirah/OrionChat) - OrionChat is a web interface for chatting with different AI providers
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Web management](https://github.com/lemonit-eric-mao/ollama-web-management) (Web management page)
 - [Promptery](https://github.com/promptery/promptery) (desktop client for Ollama.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
 - [SpaceLlama](https://github.com/tcsenpai/spacellama) (Firefox and Chrome extension to quickly summarize web pages with ollama in a sidebar)
 - [YouLama](https://github.com/tcsenpai/youlama) (Webapp to quickly summarize any YouTube video, supporting Invidious as well)
 - [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
 - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
 - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
 - [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)
+- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
 - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
 - [VT](https://github.com/vinhnx/vt.ai) (A minimal multimodal AI chat app, with dynamic conversation routing. Supports local models via Ollama)
 - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
 - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application available for Mac/Windows/Linux)
 - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
 ### Cloud
 - [Google Cloud](https://cloud.google.com/run/docs/tutorials/gpu-gemma2-with-ollama)
 - [Fly.io](https://fly.io/docs/python/do-more/add-ollama/)
 - [Koyeb](https://www.koyeb.com/deploy/ollama)
 ### Terminal
@ -348,7 +380,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Oatmeal](https://github.com/dustinblackman/oatmeal)
 - [cmdh](https://github.com/pgibler/cmdh)
 - [ooo](https://github.com/npahlfer/ooo)
- [shell-pilot](https://github.com/reid41/shell-pilot)
+- [shell-pilot](https://github.com/reid41/shell-pilot) (Interact with models via pure shell scripts on Linux or macOS)
 - [tenere](https://github.com/pythops/tenere)
 - [llm-ollama](https://github.com/taketwo/llm-ollama) for [Datasette's LLM CLI](https://llm.datasette.io/en/stable/).
 - [typechat-cli](https://github.com/anaisbetts/typechat-cli)
@ -356,11 +388,19 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [tlm](https://github.com/yusufcanb/tlm)
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
 - [ParLlama](https://github.com/paulrobello/parllama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
 - [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 - [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor
 - [x-cmd ollama](https://x-cmd.com/mod/ollama)
 - [bb7](https://github.com/drunkwcodes/bb7)
 - [SwollamaCLI](https://github.com/marcusziade/Swollama) bundled with the Swollama Swift package. [Demo](https://github.com/marcusziade/Swollama?tab=readme-ov-file#cli-usage)
 - [aichat](https://github.com/sigoden/aichat) All-in-one LLM CLI tool featuring Shell Assistant, Chat-REPL, RAG, AI tools & agents, with access to OpenAI, Claude, Gemini, Ollama, Groq, and more.
 - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
 ### Database
@ -382,9 +422,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
 - [Spring AI](https://github.com/spring-projects/spring-ai) with [reference](https://docs.spring.io/spring-ai/reference/api/chat/ollama-chat.html) and [example](https://github.com/tzolov/ollama-tools)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
 - [LLPhant](https://github.com/theodo-group/LLPhant?tab=readme-ov-file#ollama)
 - [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
@ -409,12 +451,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
 - [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
 - [LlamaScript](https://github.com/Project-Llama/llamascript)
 - [llm-axe](https://github.com/emirsahin1/llm-axe) (Python Toolkit for Building LLM Powered Apps)
 - [Gollm](https://docs.gollm.co/examples/ollama-example)
 - [Gollama for Golang](https://github.com/jonathanhecl/gollama)
 - [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
 - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
 - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
 - [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
 - [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
 - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
 - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
 ### Mobile
@ -428,6 +478,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
 - [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
 - [Continue](https://github.com/continuedev/continue)
 - [Vibe](https://github.com/thewh1teagle/vibe) (Transcribe and analyze meetings with Ollama)
 - [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
 - [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
 - [NotesOllama](https://github.com/andersrex/notesollama) (Apple Notes Ollama plugin)
@ -450,15 +501,24 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
+- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
 - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
 - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
 - [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 ### Supported backends
 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
 ### Observability
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
 - [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production. 
--- a/api/client.go
+++ b/api/client.go
@ -55,7 +55,7 @@ func checkError(resp *http.Response, body []byte) error {
 // ClientFromEnvironment creates a new [Client] using configuration from the
 // environment variable OLLAMA_HOST, which points to the network host and
-// port on which the ollama service is listenting. The format of this variable
+// port on which the ollama service is listening. The format of this variable
 // is:
 //
 //	<scheme>://<host>:<port>
--- a/api/types.go
+++ b/api/types.go
@ -12,7 +12,7 @@ import (
 	"time"
 )
-// StatusError is an error with and HTTP status code.
+// StatusError is an error with an HTTP status code and message.
 type StatusError struct {
 	StatusCode   int
 	Status       string
@ -57,7 +57,7 @@ type GenerateRequest struct {
 	Template string `json:"template"`
 	// Context is the context parameter returned from a previous call to
-	// Generate call. It can be used to keep a short conversational memory.
+	// [Client.Generate]. It can be used to keep a short conversational memory.
 	Context []int `json:"context,omitempty"`
 	// Stream specifies whether the response is streaming; it is true by default.
@ -90,14 +90,14 @@ type ChatRequest struct {
 	// Messages is the messages of the chat - can be used to keep a chat memory.
 	Messages []Message `json:"messages"`
-	// Stream enable streaming of returned response; true by default.
+	// Stream enables streaming of returned responses; true by default.
 	Stream *bool `json:"stream,omitempty"`
 	// Format is the format to return the response in (e.g. "json").
 	Format string `json:"format"`
 	// KeepAlive controls how long the model will stay loaded into memory
-	// followin the request.
+	// following the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 	// Tools is an optional list of tools the model has access to.
@ -146,6 +146,7 @@ type ToolCall struct {
 }
 // ToolCallFunction carries an index, a name, and arguments. In JSON the
 // index is omitted when it is zero; name and arguments are always emitted.
 type ToolCallFunction struct {
 	Index     int                       `json:"index,omitempty"`
 	Name      string                    `json:"name"`
 	Arguments ToolCallFunctionArguments `json:"arguments"`
 }
@ -203,8 +204,8 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }
-// Options specified in [GenerateRequest], if you add a new option here add it
+// Options specified in [GenerateRequest].  If you add a new option here, also
-// to the API docs also.
+// add it to the API docs.
 type Options struct {
 	Runner
--- a/app/store/store.go
+++ b/app/store/store.go
@ -64,7 +64,7 @@ func initStore() {
 		slog.Debug(fmt.Sprintf("unexpected error searching for store: %s", err))
 	}
 	slog.Debug("initializing new store")
-	store.ID = uuid.New().String()
+	store.ID = uuid.NewString()
 	writeStore(getStorePath())
 }
--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@ -39,7 +39,7 @@ func (t *winTray) UpdateAvailable(ver string) error {
 		if err := t.addOrUpdateMenuItem(updateAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
-		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
+		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenuTitle, false); err != nil {
 			return fmt.Errorf("unable to create menu entries %w", err)
 		}
 		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
--- a/app/tray/wintray/messages.go
+++ b/app/tray/wintray/messages.go
@ -10,6 +10,6 @@ const (
 	quitMenuTitle            = "Quit Ollama"
 	updateAvailableMenuTitle = "An update is available"
-	updateMenutTitle         = "Restart to update"
+	updateMenuTitle          = "Restart to update"
 	diagLogsMenuTitle        = "View logs"
 )
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@ -361,7 +361,7 @@ func (t *winTray) showMenu() error {
 	boolRet, _, err = pTrackPopupMenu.Call(
 		uintptr(t.menus[0]),
-		TPM_BOTTOMALIGN|TPM_LEFTALIGN,
+		TPM_BOTTOMALIGN|TPM_LEFTALIGN|TPM_RIGHTBUTTON,
 		uintptr(p.X),
 		uintptr(p.Y),
 		0,
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@ -67,6 +67,7 @@ const (
 	SW_HIDE             = 0
 	TPM_BOTTOMALIGN     = 0x0020
 	TPM_LEFTALIGN       = 0x0000
 	TPM_RIGHTBUTTON     = 0x0002
 	WM_CLOSE            = 0x0010
 	WM_USER             = 0x0400
 	WS_CAPTION          = 0x00C00000
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -19,7 +19,6 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@ -35,13 +34,11 @@ import (
 	"golang.org/x/term"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@ -456,6 +453,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	if len(prompts) > 0 {
 		interactive = false
 	}
 	// Be quiet if we're redirecting to a pipe or file
 	if !term.IsTerminal(int(os.Stdout.Fd())) {
 		interactive = false
 	}
 	nowrap, err := cmd.Flags().GetBool("nowordwrap")
 	if err != nil {
@ -512,47 +513,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generate(cmd, opts)
 }
 // errFromUnknownKey returns unknownKeyErr extended with the local public key
 // and the URL where it can be registered, but only when the SSH public key
 // found in the error text is the caller's own key. In every other case
 // (no key in the message, local key unreadable, keys differ) the original
 // error is returned untouched.
 func errFromUnknownKey(unknownKeyErr error) error {
 	// Look for an SSH public key embedded in the error text.
 	keyRe := regexp.MustCompile(`ssh-\w+ [^\s"]+`)
 	serverKey := keyRe.FindString(unknownKeyErr.Error())
 	if serverKey == "" {
 		return unknownKeyErr
 	}
 	localKey, err := auth.GetPublicKey()
 	if err != nil {
 		return unknownKeyErr
 	}
 	// On Linux, fall back to the ollama service's public key when the
 	// user's own key is not the one reported in the error.
 	if serverKey != localKey && runtime.GOOS == "linux" {
 		raw, readErr := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
 		if readErr != nil {
 			return unknownKeyErr
 		}
 		localKey = strings.TrimSpace(string(raw))
 	}
 	// Only echo a key that is known locally; this prevents adding a remote
 	// key to the user's account.
 	if serverKey != localKey {
 		return unknownKeyErr
 	}
 	return errors.New(unknownKeyErr.Error() +
 		"\n\nYour ollama key is:\n" +
 		localKey +
 		"\nAdd your key at:\n" +
 		"https://ollama.com/settings/keys")
 }
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@ -599,6 +559,8 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 	}
 	request := api.PushRequest{Name: args[0], Insecure: insecure}
 	n := model.ParseName(args[0])
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
 		if spinner != nil {
 			spinner.Stop()
@ -606,18 +568,19 @@ func PushHandler(cmd *cobra.Command, args []string) error {
 		if strings.Contains(err.Error(), "access denied") {
 			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
 		}
 		host := model.ParseName(args[0]).Host
 		isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
 		if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
 			// the user has not added their ollama key to ollama.com
 			// re-throw an error with a more user-friendly message
 			return errFromUnknownKey(err)
 		}
 		return err
 	}
 	p.Stop()
 	spinner.Stop()
 	destination := n.String()
 	if strings.HasSuffix(n.Host, ".ollama.ai") || strings.HasSuffix(n.Host, ".ollama.com") {
 		destination = "https://ollama.com/" + strings.TrimSuffix(n.DisplayShortest(), ":latest")
 	}
 	fmt.Printf("\nYou can find your model at:\n\n")
 	fmt.Printf("\t%s\n", destination)
 	return nil
 }
@ -800,9 +763,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		case "parameters":
 			fmt.Println(resp.Parameters)
 		case "system":
-			fmt.Println(resp.System)
+			fmt.Print(resp.System)
 		case "template":
-			fmt.Println(resp.Template)
+			fmt.Print(resp.Template)
 		}
 		return nil
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
@ -369,3 +370,127 @@ func TestGetModelfileName(t *testing.T) {
 		})
 	}
 }
 // TestPushHandler runs PushHandler against a mock HTTP server for each table
 // case. A case with an empty expectedError must succeed and, when
 // expectedOutput is set, must write exactly that text to stdout; otherwise the
 // returned error must contain expectedError.
 func TestPushHandler(t *testing.T) {
 	tests := []struct {
 		name           string
 		modelName      string
 		serverResponse map[string]func(w http.ResponseWriter, r *http.Request)
 		expectedError  string
 		expectedOutput string
 	}{
 		{
 			name:      "successful push",
 			modelName: "test-model",
 			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
 				"/api/push": func(w http.ResponseWriter, r *http.Request) {
 					if r.Method != http.MethodPost {
 						t.Errorf("expected POST request, got %s", r.Method)
 					}
 					var req api.PushRequest
 					if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 						http.Error(w, err.Error(), http.StatusBadRequest)
 						return
 					}
 					if req.Name != "test-model" {
 						t.Errorf("expected model name 'test-model', got %s", req.Name)
 					}
 					// Simulate progress updates
 					responses := []api.ProgressResponse{
 						{Status: "preparing manifest"},
 						{Digest: "sha256:abc123456789", Total: 100, Completed: 50},
 						{Digest: "sha256:abc123456789", Total: 100, Completed: 100},
 					}
 					// Each update is written as its own JSON document and flushed
 					// immediately.
 					for _, resp := range responses {
 						if err := json.NewEncoder(w).Encode(resp); err != nil {
 							http.Error(w, err.Error(), http.StatusInternalServerError)
 							return
 						}
 						w.(http.Flusher).Flush()
 					}
 				},
 			},
 			expectedOutput: "\nYou can find your model at:\n\n\thttps://ollama.com/test-model\n",
 		},
 		{
 			name:      "unauthorized push",
 			modelName: "unauthorized-model",
 			serverResponse: map[string]func(w http.ResponseWriter, r *http.Request){
 				"/api/push": func(w http.ResponseWriter, r *http.Request) {
 					// Reply with 401 and a JSON body whose "error" field is
 					// "access denied".
 					w.Header().Set("Content-Type", "application/json")
 					w.WriteHeader(http.StatusUnauthorized)
 					err := json.NewEncoder(w).Encode(map[string]string{
 						"error": "access denied",
 					})
 					if err != nil {
 						t.Fatal(err)
 					}
 				},
 			},
 			expectedError: "you are not authorized to push to this namespace, create the model under a namespace you own",
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			// Dispatch by request path to the case's handler; any other path
 			// gets a 404.
 			mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				if handler, ok := tt.serverResponse[r.URL.Path]; ok {
 					handler(w, r)
 					return
 				}
 				http.Error(w, "not found", http.StatusNotFound)
 			}))
 			defer mockServer.Close()
 			// Point OLLAMA_HOST at the mock server for the duration of this subtest.
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
 			cmd.SetContext(context.TODO())
 			// Redirect stderr to capture progress output
 			// NOTE(review): the os.Pipe error is discarded here.
 			oldStderr := os.Stderr
 			r, w, _ := os.Pipe()
 			os.Stderr = w
 			// Capture stdout for the "You can find your model at" message
 			// NOTE(review): the os.Pipe error is discarded here as well.
 			oldStdout := os.Stdout
 			outR, outW, _ := os.Pipe()
 			os.Stdout = outW
 			err := PushHandler(cmd, []string{tt.modelName})
 			// Restore stderr
 			// The write end is closed first so the ReadAll below reaches EOF.
 			w.Close()
 			os.Stderr = oldStderr
 			// drain the pipe
 			if _, err := io.ReadAll(r); err != nil {
 				t.Fatal(err)
 			}
 			// Restore stdout and get output
 			outW.Close()
 			os.Stdout = oldStdout
 			stdout, _ := io.ReadAll(outR)
 			if tt.expectedError == "" {
 				if err != nil {
 					t.Errorf("expected no error, got %v", err)
 				}
 				// Stdout is compared exactly, and only when the case sets
 				// expectedOutput.
 				if tt.expectedOutput != "" {
 					if got := string(stdout); got != tt.expectedOutput {
 						t.Errorf("expected output %q, got %q", tt.expectedOutput, got)
 					}
 				}
 			} else {
 				// The error only needs to contain expectedError as a substring.
 				if err == nil || !strings.Contains(err.Error(), tt.expectedError) {
 					t.Errorf("expected error containing %q, got %v", tt.expectedError, err)
 				}
 			}
 		})
 	}
 }
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@ -319,8 +319,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						opts.Messages = append(opts.Messages, newMessage)
 					}
 					fmt.Println("Set system message.")
 					sb.Reset()
 					sb.Reset()
 					continue
 				default:
@ -516,7 +514,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|svg)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
 	re := regexp.MustCompile(regexPattern)
 	return re.FindAllString(input, -1)
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@ -12,44 +12,45 @@ import (
 func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble 
- ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2
+ ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.svg`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
 	res := extractFileNames(input)
 	assert.Len(t, res, 5)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
-	assert.Contains(t, res[4], "five.svg")
+	assert.Contains(t, res[4], "five.JPG")
 	assert.NotContains(t, res[4], '"')
-	assert.NotContains(t, res, "inbtween")
+	assert.NotContains(t, res, "inbetween1")
 	assert.NotContains(t, res, "./1.svg")
 	// Windows style paths
 	input = ` some preamble
 c:/users/jdoe/one.png inbetween1 c:/program files/someplace/two.jpg inbetween2 
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
-./relative\ path/five.svg inbetween5 "./relative with/spaces/six.png inbetween6
+./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
-d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8 
+d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 
- d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.svg some ending
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 `
 	res = extractFileNames(input)
 	assert.Len(t, res, 10)
-	assert.NotContains(t, res, "inbtween")
+	assert.NotContains(t, res, "inbetween2")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[1], "c:")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
-	assert.Contains(t, res[4], "five.svg")
+	assert.Contains(t, res[4], "five.JPG")
 	assert.Contains(t, res[5], "six.png")
-	assert.Contains(t, res[6], "seven.svg")
+	assert.Contains(t, res[6], "seven.JPEG")
 	assert.Contains(t, res[6], "d:")
 	assert.Contains(t, res[7], "eight.png")
 	assert.Contains(t, res[7], "c:")
 	assert.Contains(t, res[8], "nine.png")
 	assert.Contains(t, res[8], "d:")
-	assert.Contains(t, res[9], "ten.svg")
+	assert.Contains(t, res[9], "ten.PNG")
 	assert.Contains(t, res[9], "E:")
 }
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@ -350,7 +350,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				return nil, err
 			}
 		}
-		gpuInfo.DependencyPath = libDir
+		gpuInfo.DependencyPath = []string{libDir}
 		if gfxOverride == "" {
 			// Only load supported list once
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@ -111,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,
 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: libDir,
+				DependencyPath: []string{libDir},
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -240,7 +240,7 @@ func GetGPUInfo() GpuInfoList {
 					Library:        "cpu",
 					Variant:        cpuCapability.String(),
 					ID:             "0",
-					DependencyPath: depPath,
+					DependencyPath: []string{depPath},
 				},
 				CPUs: details,
 			},
@ -293,11 +293,11 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
 				if depPath != "" {
-					gpuInfo.DependencyPath = depPath
+					gpuInfo.DependencyPath = []string{depPath}
 					// Check for variant specific directory
 					if variant != "" {
 						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+							gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
 						}
 					}
 				}
@ -370,7 +370,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = depPath
+						gpuInfo.DependencyPath = []string{depPath}
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
--- a/discover/types.go
+++ b/discover/types.go
@ -25,7 +25,7 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	MinimumMemory uint64 `json:"-"`
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-	DependencyPath string `json:"lib_path,omitempty"`
+	DependencyPath []string `json:"lib_path,omitempty"`
 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
--- a/docs/api.md
+++ b/docs/api.md
@ -830,10 +830,30 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m
 ### Parameters
- `name`: name of the model to create
+- `model`: name of the model to create
 - `modelfile` (optional): contents of the Modelfile
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `path` (optional): path to the Modelfile
 - `quantize` (optional): quantize a non-quantized (e.g. float16) model
 #### Quantization types
 | Type | Recommended |
 | --- | :-: |
 | q2_K | |
 | q3_K_L | |
 | q3_K_M | |
 | q3_K_S | |
 | q4_0 | |
 | q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
 | q5_0 | |
 | q5_1 | |
 | q5_K_M | |
 | q5_K_S | |
 | q6_K | |
 | q8_0 | * |
 ### Examples
@ -845,14 +865,14 @@ Create a new model from a `Modelfile`.
 ```shell
 curl http://localhost:11434/api/create -d '{
-  "name": "mario",
+  "model": "mario",
  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```
 ##### Response
-A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.
+A stream of JSON objects is returned:
 ```json
 {"status":"reading model metadata"}
@ -868,13 +888,43 @@ A stream of JSON objects. Notice that the final JSON object shows a `"status": "
 {"status":"success"}
 ```
 #### Quantize a model
 Quantize a non-quantized model.
 ##### Request
 ```shell
 curl http://localhost:11434/api/create -d '{
  "model": "llama3.1:quantized",
  "modelfile": "FROM llama3.1:8b-instruct-fp16",
  "quantize": "q4_K_M"
 }'
 ```
 ##### Response
 A stream of JSON objects is returned:
 ```
 {"status":"quantizing F16 model to Q4_K_M"}
 {"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
 {"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
 {"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
 {"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```
 ### Check if a Blob Exists
 ```shell
 HEAD /api/blobs/:digest
 ```
-Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.
 #### Query Parameters
@ -979,7 +1029,7 @@ Show information about a model including details, modelfile, template, parameter
 ### Parameters
- `name`: name of the model to show
+- `model`: name of the model to show
 - `verbose`: (optional) if set to `true`, returns full data for verbose response fields
 ### Examples
@ -988,7 +1038,7 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama3.2"
+  "model": "llama3.2"
 }'
 ```
@ -1068,7 +1118,7 @@ Delete a model and its data.
 ### Parameters
- `name`: model name to delete
+- `model`: model name to delete
 ### Examples
@ -1076,7 +1126,7 @@ Delete a model and its data.
 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "name": "llama3:13b"
+  "model": "llama3:13b"
 }'
 ```
@ -1094,7 +1144,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ### Parameters
- `name`: name of the model to pull
+- `model`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@ -1104,7 +1154,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama3.2"
+  "model": "llama3.2"
 }'
 ```
@ -1166,7 +1216,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 ### Parameters
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@ -1176,7 +1226,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 ```shell
 curl http://localhost:11434/api/push -d '{
-  "name": "mattw/pygmalion:latest"
+  "model": "mattw/pygmalion:latest"
 }'
 ```
--- a/docs/docker.md
+++ b/docs/docker.md
@ -50,6 +50,9 @@ sudo systemctl restart docker
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```
 > [!NOTE]  
 > If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable JETSON_JETPACK=5 or JETSON_JETPACK=6 to the container to select version 5 or 6.
 ### AMD GPU
 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
--- a/docs/import.md
+++ b/docs/import.md
@ -32,7 +32,7 @@ ollama run my-model
 Ollama supports importing adapters based on several different model architectures including:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
  * Gemma (including Gemma 1 and Gemma 2)
@ -67,14 +67,12 @@ ollama run my-model
 Ollama supports importing models for several different architectures including:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1);
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
  * Gemma (including Gemma 1 and Gemma 2); and
  * Phi3
-This includes importing foundation models as well as any fine tuned models which which have been _fused_ with a foundation model.
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
 ## Importing a GGUF based model or adapter
 If you have a GGUF based model or adapter it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
@ -83,7 +81,7 @@ If you have a GGUF based model or adapter it is possible to import it into Ollam
  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
  * downloading a model or adapter from a place such as HuggingFace
-To import a GGUF model, create a `Modelfile` containg:
+To import a GGUF model, create a `Modelfile` containing:
 ```dockerfile
 FROM /path/to/file.gguf
--- a/docs/linux.md
+++ b/docs/linux.md
@ -112,6 +112,21 @@ sudo systemctl status ollama
 > https://www.amd.com/en/support/linux-drivers for best support of your Radeon
 > GPU.
 ## Customizing
 To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
 ```
 sudo systemctl edit ollama
 ```
 Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`:
 ```ini
 [Service]
 Environment="OLLAMA_DEBUG=1"
 ```
 ## Updating
 Update Ollama by running the install script again:
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@ -120,7 +120,7 @@ FROM <model directory>
 The model directory should contain the Safetensors weights for a supported architecture.
 Currently supported model architectures:
-  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
  * Gemma (including Gemma 1 and Gemma 2)
  * Phi3
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@ -95,13 +95,21 @@ If none of those resolve the problem, gather additional information and file an
 On Linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
-When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.  For example, in the following output `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0` the group ID column is `44`.
 If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
 If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
 - `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
 - `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
 - Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
 ## Multiple AMD GPUs
 If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
 - https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
 ## Windows Terminal Errors
 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in a long sequence of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`.  To resolve this problem, please update to Windows 10 22H1 or newer.
--- a/docs/tutorials/fly-gpu.md
+++ b/docs/tutorials/fly-gpu.md
@ -1,83 +0,0 @@
 # Running Ollama on Fly.io GPU Instances
 Ollama runs with little to no configuration on [Fly.io GPU instances](https://fly.io/docs/gpus/gpu-quickstart/). If you don't have access to GPUs yet, you'll need to [apply for access](https://fly.io/gpu/) on the waitlist. Once you're accepted, you'll get an email with instructions on how to get started.
 Create a new app with `fly apps create`:
 ```bash
 fly apps create
 ```
 Then create a `fly.toml` file in a new folder that looks like this:
 ```toml
 app = "sparkling-violet-709"
 primary_region = "ord"
 vm.size = "a100-40gb" # see https://fly.io/docs/gpus/gpu-quickstart/ for more info
 [build]
  image = "ollama/ollama"
 [http_service]
  internal_port = 11434
  force_https = false
  auto_stop_machines = true
  auto_start_machines = true
  min_machines_running = 0
  processes = ["app"]
 [mounts]
  source = "models"
  destination = "/root/.ollama"
  initial_size = "100gb"
 ```
 Then create a [new private IPv6 address](https://fly.io/docs/reference/private-networking/#flycast-private-load-balancing) for your app:
 ```bash
 fly ips allocate-v6 --private
 ```
 Then deploy your app:
 ```bash
 fly deploy
 ```
 And finally you can access it interactively with a new Fly.io Machine:
 ```
 fly machine run -e OLLAMA_HOST=http://your-app-name.flycast --shell ollama/ollama
 ```
 ```bash
 $ ollama run openchat:7b-v3.5-fp16
 >>> How do I bake chocolate chip cookies?
 To bake chocolate chip cookies, follow these steps:
 1. Preheat the oven to 375°F (190°C) and line a baking sheet with parchment paper or silicone baking mat.
 2. In a large bowl, mix together 1 cup of unsalted butter (softened), 3/4 cup granulated sugar, and 3/4
 cup packed brown sugar until light and fluffy.
 3. Add 2 large eggs, one at a time, to the butter mixture, beating well after each addition. Stir in 1
 teaspoon of pure vanilla extract.
 4. In a separate bowl, whisk together 2 cups all-purpose flour, 1/2 teaspoon baking soda, and 1/2 teaspoon
 salt. Gradually add the dry ingredients to the wet ingredients, stirring until just combined.
 5. Fold in 2 cups of chocolate chips (or chunks) into the dough.
 6. Drop rounded tablespoons of dough onto the prepared baking sheet, spacing them about 2 inches apart.
 7. Bake for 10-12 minutes, or until the edges are golden brown. The centers should still be slightly soft.
 8. Allow the cookies to cool on the baking sheet for a few minutes before transferring them to a wire rack
 to cool completely.
 Enjoy your homemade chocolate chip cookies!
 ```
 When you set it up like this, it will automatically turn off when you're done using it. Then when you access it again, it will automatically turn back on. This is a great way to save money on GPU instances when you're not using them. If you want a persistent wake-on-use connection to your Ollama instance, you can set up a [connection to your Fly network using WireGuard](https://fly.io/docs/reference/private-networking/#discovering-apps-through-dns-on-a-wireguard-connection). Then you can access your Ollama instance at `http://your-app-name.flycast`.
 And that's it!
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@ -1,77 +0,0 @@
 # Using LangChain with Ollama using JavaScript
 In this tutorial, we are going to use JavaScript with LangChain and Ollama to learn about something just a touch more recent. In August 2023, there was a series of wildfires on Maui. There is no way an LLM trained before that time can know about this, since their training data would not include anything as recent as that. So we can find the [Wikipedia article about the fires](https://en.wikipedia.org/wiki/2023_Hawaii_wildfires) and ask questions about the contents.
 To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
 ```bash
 npm install @langchain/community
 ```
 Now we can start building out our JavaScript:
 ```javascript
 import { Ollama } from "@langchain/community/llms/ollama";
 const ollama = new Ollama({
  baseUrl: "http://localhost:11434",
  model: "llama3.2",
 });
 const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```
 That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
 ```bash
 npm install cheerio
 ```
 ```javascript
 import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";
 const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires");
 const data = await loader.load();
 ```
 That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need.
 ```bash
 npm install @tensorflow/tfjs-core@3.6.0 @tensorflow/tfjs-converter@3.6.0 @tensorflow-models/universal-sentence-encoder@1.3.3 @tensorflow/tfjs-node@4.10.0
 ```
 If you just install those components without the version numbers, it will install the latest versions, but there are conflicts within **Tensorflow**, so you need to install the compatible versions.
 ```javascript
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { MemoryVectorStore } from "langchain/vectorstores/memory";
 import "@tensorflow/tfjs-node";
 import { TensorFlowEmbeddings } from "langchain/embeddings/tensorflow";
 // Split the text into 500 character chunks. And overlap each chunk by 20 characters
 const textSplitter = new RecursiveCharacterTextSplitter({
 chunkSize: 500,
 chunkOverlap: 20
 });
 const splitDocs = await textSplitter.splitDocuments(data);
 // Then use the TensorFlow Embedding to store these chunks in the datastore
 const vectorStore = await MemoryVectorStore.fromDocuments(splitDocs, new TensorFlowEmbeddings());
 ```
 To connect the datastore to a question asked to an LLM, we need to use the concept at the heart of **LangChain**: the chain. Chains are a way to connect a number of activities together to accomplish a particular task. There are a number of chain types available, but for this tutorial we are using the **RetrievalQAChain**.
 ```javascript
 import { RetrievalQAChain } from "langchain/chains";
 const retriever = vectorStore.asRetriever();
 const chain = RetrievalQAChain.fromLLM(ollama, retriever);
 const result = await chain.call({query: "When was Hawaii's request for a major disaster declaration approved?"});
 console.log(result.text)
 ```
 So we created a retriever, which is a way to return the chunks that match a query from a datastore. And then connect the retriever and the model via a chain. Finally, we send a query to the chain, which results in an answer using our document as a source. The answer it returned was correct, August 10, 2023.
 And that is a simple introduction to what you can do with **LangChain** and **Ollama.**
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@ -1,85 +0,0 @@
 # Using LangChain with Ollama in Python
 Let's imagine we are studying the classics, such as **the Odyssey** by **Homer**. We might have a question about Neleus and his family. If you ask llama2 for that info, you may get something like:
 > I apologize, but I'm a large language model, I cannot provide information on individuals or families that do not exist in reality. Neleus is not a real person or character, and therefore does not have a family or any other personal details. My apologies for any confusion. Is there anything else I can help you with?
 This sounds like a typical censored response, but even llama2-uncensored gives a mediocre answer:
 > Neleus was a legendary king of Pylos and the father of Nestor, one of the Argonauts. His mother was Clymene, a sea nymph, while his father was Neptune, the god of the sea.
 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.
 Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:
 `pip install langchain_community`
 Then we can create a model and ask the question:
 ```python
 from langchain_community.llms import Ollama
 ollama = Ollama(
    base_url='http://localhost:11434',
    model="llama3"
 )
 print(ollama.invoke("why is the sky blue"))
 ```
 Notice that we are defining the model and the base URL for Ollama.
 Now let's load a document to ask questions against. I'll load up the Odyssey by Homer, which you can find at Project Gutenberg. We will need **WebBaseLoader** which is part of **LangChain** and loads text from any webpage. On my machine, I also needed to install **bs4** to get that to work, so run `pip install bs4`.
 ```python
 from langchain.document_loaders import WebBaseLoader
 loader = WebBaseLoader("https://www.gutenberg.org/files/1727/1727-h/1727-h.htm")
 data = loader.load()
 ```
 This file is pretty big. Just the preface is 3000 tokens. Which means the full document won't fit into the context for the model. So we need to split it up into smaller pieces.
 ```python
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
 all_splits = text_splitter.split_documents(data)
 ```
 It's split up, but we have to find the relevant splits and then submit those to the model. We can do this by creating embeddings and storing them in a vector database. We can use Ollama directly to instantiate an embedding model. We will use ChromaDB in this example for a vector database. `pip install chromadb`
 We also need to pull an embedding model: `ollama pull nomic-embed-text`
 ```python
 from langchain.embeddings import OllamaEmbeddings
 from langchain.vectorstores import Chroma
 oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")
 vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)
 ```
 Now let's ask a question from the document. **Who was Neleus, and who is in his family?** Neleus is a character in the Odyssey, and the answer can be found in our text.
 ```python
 question="Who is Neleus and who is in Neleus' family?"
 docs = vectorstore.similarity_search(question)
 len(docs)
 ```
 This will output the number of matches for chunks of data similar to the search.
 The next thing is to send the question and the relevant parts of the docs to the model to see if we can get a good answer. But we are stitching two parts of the process together, and that is called a chain. This means we need to define a chain:
 ```python
 from langchain.chains import RetrievalQA
 qachain=RetrievalQA.from_chain_type(ollama, retriever=vectorstore.as_retriever())
 res = qachain.invoke({"query": question})
 print(res['result'])
 ```
 The answer received from this chain was:
 > Neleus is a character in Homer's "Odyssey" and is mentioned in the context of Penelope's suitors. Neleus is the father of Chloris, who is married to Neleus and bears him several children, including Nestor, Chromius, Periclymenus, and Pero. Amphinomus, the son of Nisus, is also mentioned as a suitor of Penelope and is known for his good natural disposition and agreeable conversation.
 It's not a perfect answer, as it implies Neleus married his daughter when actually Chloris "was the youngest daughter to Amphion son of Iasus and king of Minyan Orchomenus, and was Queen in Pylos".
 I updated the chunk_overlap for the text splitter to 20 and tried again and got a much better answer:
 > Neleus is a character in Homer's epic poem "The Odyssey." He is the husband of Chloris, who is the youngest daughter of Amphion son of Iasus and king of Minyan Orchomenus. Neleus has several children with Chloris, including Nestor, Chromius, Periclymenus, and Pero.
 And that is a much better answer.
--- a/docs/tutorials/nvidia-jetson.md
+++ b/docs/tutorials/nvidia-jetson.md
@ -1,15 +0,0 @@
 # Running Ollama on NVIDIA Jetson Devices
 Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) and should run out of the box with the standard installation instructions. 
 The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack), but should also work on JetPack 6.0.
 - Install Ollama via standard Linux command (ignore the 404 error): `curl https://ollama.com/install.sh | sh`
 - Pull the model you want to use (e.g. mistral): `ollama pull mistral`
 - Start an interactive session: `ollama run mistral`
 And that's it!
 # Running Ollama in Docker
 When running GPU accelerated applications in Docker, it is highly recommended to use [dusty-nv jetson-containers repo](https://github.com/dusty-nv/jetson-containers).
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@ -1,6 +1,6 @@
 from langchain.llms import Ollama
-input = input("What is your question?")
+input = input("What is your question?\n> ")
 llm = Ollama(model="llama3.2")
-res = llm.predict(input)
+res = llm.invoke(input)
 print (res)
--- a/go.mod
+++ b/go.mod
@ -7,12 +7,12 @@ require (
 	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.10.0
 	github.com/golang/protobuf v1.5.4 // indirect
-	github.com/google/uuid v1.1.2
+	github.com/google/uuid v1.6.0
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.3.0
+	golang.org/x/sync v0.9.0
 )
 require (
@ -22,14 +22,14 @@ require (
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
-	golang.org/x/image v0.14.0
+	golang.org/x/image v0.22.0
 )
 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
-	github.com/chewxy/math32 v1.10.1 // indirect
+	github.com/chewxy/math32 v1.11.0 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
@ -73,7 +73,7 @@ require (
 	golang.org/x/net v0.25.0 // indirect
 	golang.org/x/sys v0.20.0
 	golang.org/x/term v0.20.0
-	golang.org/x/text v0.15.0
+	golang.org/x/text v0.20.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@ -21,8 +21,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA
 github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
 github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
 github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
-github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU=
+github.com/chewxy/math32 v1.11.0 h1:8sek2JWqeaKkVnHa7bPVqCEOUPbARo4SGxs6toKyAOo=
-github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
+github.com/chewxy/math32 v1.11.0/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
 github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
 github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
 github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
@ -113,8 +113,9 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
@ -230,8 +231,8 @@ golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+o
 golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
-golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
+golang.org/x/image v0.22.0 h1:UtK5yLUzilVrkjMAZAZ34DXGpASN8i8pj8g+O+yd10g=
-golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
+golang.org/x/image v0.22.0/go.mod h1:9hPFhljd4zZ1GNSIZJ49sqbp45GKK9t6w+iXvGqZUz4=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
@ -265,8 +266,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
+golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
-golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@ -291,8 +292,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
-golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/integration/context_test.go
+++ b/integration/context_test.go
@ -10,7 +10,38 @@ import (
 	"github.com/ollama/ollama/api"
 )
 // TestLongInputContext issues a generate request against the "llama2" model
 // with a long prose prompt and a small context window (num_ctx = 128), then
 // hands the request to DoGenerate together with a list of country names.
 //
 // NOTE(review): presumably DoGenerate passes when the response contains any
 // of the listed words — confirm against the helper's implementation.
 func TestLongInputContext(t *testing.T) {
 	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
 	// we asked for and there is nothing extra that we could spill over into
 	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
 	// A longer overall timeout is needed for small-footprint GPUs
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
 		Model:  "llama2",
 		Prompt: "Oh, don’t speak to me of Austria. Perhaps I don’t understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexander’s loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I don’t believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
 		Stream: &stream,
 		Options: map[string]interface{}{
 			// Fixed temperature and seed; NOTE(review): presumably chosen to
 			// make the output reproducible — confirm.
 			"temperature": 0,
 			"seed":        123,
 			// Deliberately small context window for this test.
 			"num_ctx":     128,
 		},
 	}
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
 	// Make sure the model is available before generating.
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
 	// NOTE(review): the two durations look like initial and streaming
 	// timeouts — verify against DoGenerate's signature.
 	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia"}, 120*time.Second, 10*time.Second)
 }
 func TestContextExhaustion(t *testing.T) {
 	// Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
 	// we asked for and there is nothing extra that we could spill over into
 	t.Setenv("OLLAMA_NUM_PARALLEL", "1")
 	// Longer needed for small footprint GPUs
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@ -16,7 +16,6 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 )
 func TestMaxQueue(t *testing.T) {
@ -27,12 +26,8 @@ func TestMaxQueue(t *testing.T) {
 	// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless you're on GPU
 	// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
-	threadCount := 32
+	threadCount := 16
-	if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
+	t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
 		threadCount = int(maxQueue)
 	} else {
 		t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
 	}
 	req := api.GenerateRequest{
 		Model:  "orca-mini",
--- a/llama/README.md
+++ b/llama/README.md
@ -55,7 +55,7 @@ go build -tags avx,cuda .
 ### ROCm
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
+Install [ROCm](https://rocm.docs.amd.com/en/latest/).
 ```shell
 make ggml_hipblas.so
@ -77,7 +77,7 @@ go build -tags avx,cuda .
 ### ROCm
-Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/).
+Install [ROCm](https://rocm.docs.amd.com/en/latest/).
 ```shell
 make ggml_hipblas.dll
@ -93,7 +93,7 @@ make -j
 ## Vendoring
-Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model.  While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit.  A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
+Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets is available to aid developers in updating to a newer tracking commit, or to work on changes.
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
@ -105,35 +105,35 @@ make apply-patches
 **Pin to new base commit**
-To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`
+To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
 #### Applying patches
 When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
-Start by applying the patches.  If any of the patches have conflicts, the `git am` will stop at the first failure.
+Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
 ```
 make apply-patches
 ```
-If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed.  Save the file(s) and continue the patch series with `git am --continue` .  If any additional patches fail, follow the same pattern until the full patch series is applied.  Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue`. If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
 ```
 make create-patches sync
 ```
-Build and test Ollama, and make any necessary changes to the Go code based on the new base commit.  Submit your PR to the Ollama repo.
+Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
 ### Generating Patches
-When working on new fixes or features that impact vendored code, use the following model.  First get a clean tracking repo with all current patches applied:
+When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
 ```
 make apply-patches
 ```
-Now edit the upstream native code in the `./vendor/` directory.  You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing.  Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
+Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
 ```
 make sync
@ -142,9 +142,9 @@ go build .
 ```
 > [!IMPORTANT]
-> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo.  It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
+> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
-Iterate until you're ready to submit PRs.  Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
+Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
 ```
 make create-patches
--- a/llama/llama.go
+++ b/llama/llama.go
@ -21,6 +21,8 @@ package llama
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
 #cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
 #cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
 #cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
@ -36,8 +38,8 @@ package llama
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
+#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -D__ARM_FEATURE_MATMUL_INT8
+#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
@ -155,9 +157,7 @@ type Context struct {
 	numThreads int
 }
-func (c *Context) KvCacheClear() {
+var ErrKvCacheFull = errors.New("could not find a kv cache slot")
 	C.llama_kv_cache_clear(c.c)
 }
 func (c *Context) Decode(batch *Batch) error {
 	// Positive return values do not mean a fatal error, but rather a warning.
@ -171,7 +171,7 @@ func (c *Context) Decode(batch *Batch) error {
 	}
 	if code > 0 {
-		return fmt.Errorf("could not find a KV slot for the batch - try reducing the size of the batch or increase the context. code: %d", code)
+		return ErrKvCacheFull
 	}
 	return nil
@ -193,6 +193,14 @@ func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
 	C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
 }
 // KvCacheClear invokes llama_kv_cache_clear on the underlying C context.
 func (c *Context) KvCacheClear() {
 	C.llama_kv_cache_clear(c.c)
 }
 // KvCacheDefrag invokes llama_kv_cache_defrag on the underlying C context.
 func (c *Context) KvCacheDefrag() {
 	C.llama_kv_cache_defrag(c.c)
 }
 // Get the embeddings for a sequence id
 func (c *Context) GetEmbeddingsSeq(seqId int) []float32 {
 	embeddings := unsafe.Pointer(C.llama_get_embeddings_seq(c.c, C.int(seqId)))
@ -382,6 +390,8 @@ func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...
 	if logits {
 		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
 	} else {
 		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 0
 	}
 	b.c.n_tokens += 1
@ -598,6 +608,10 @@ func (c *Context) SetCrossAttention(state bool) {
 	C.llama_set_cross_attention(c.c, C.bool(state))
 }
 // Synchronize invokes llama_synchronize on the underlying C context.
 // NOTE(review): presumably blocks until in-flight backend work has
 // completed — confirm against the llama.cpp API documentation.
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
 // sampling
 // TODO: this is a temporary wrapper to allow calling C++ code from CGo
 type SamplingContext struct {
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@ -20,7 +20,7 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
 GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS))))
+GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 ifeq ($(OS),linux)
 	CUDA_PATH?=/usr/local/cuda
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@ -2,6 +2,7 @@ package main
 import (
 	"errors"
 	"fmt"
 	"log/slog"
 	"reflect"
 	"time"
@ -22,7 +23,11 @@ type InputCache struct {
 	lc *llama.Context
 }
-func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) *InputCache {
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
 	if kvSize/numSlots < 1 {
 		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
 	}
 	slots := make([]InputCacheSlot, numSlots)
 	for i := range slots {
@ -37,7 +42,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		slots:          slots,
 		multiUserCache: multiUserCache,
 		lc:             lc,
-	}
+	}, nil
 }
 // Locking: Operations on InputCacheSlot (including finding one
@ -58,7 +63,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }
-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, int, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
 	var slot *InputCacheSlot
 	var numPast int
 	var err error
@ -75,7 +80,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 		slot, numPast, err = c.findBestCacheSlot(prompt)
 	}
 	if err != nil {
-		return nil, nil, 0, err
+		return nil, nil, err
 	}
 	if !cachePrompt {
@ -102,7 +107,7 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	prompt = prompt[numPast:]
 	slot.Inputs = slot.Inputs[:numPast]
-	return slot, prompt, numPast, nil
+	return slot, prompt, nil
 }
 func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
@ -194,14 +199,48 @@ func countCommonPrefix(a []input, b []input) int {
 	return count
 }
-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscard int, numPast int) {
+func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
-	// TODO (jessegross): KV cache removal can fail for certain types of models
+	targetFree := (c.numCtx - numKeep) / 2
-	// server.cpp doesn't handle this, though we can be more graceful
+	targetFree = max(targetFree, 1)
 	c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+numDiscard)
 	c.lc.KvCacheSeqAdd(slot.Id, numKeep+numDiscard, numPast, -numDiscard)
-	for i := numKeep + numDiscard; i < len(slot.Inputs); i++ {
+	currentFree := c.numCtx - inputLen
-		slot.Inputs[i-numDiscard] = slot.Inputs[i]
+	discard := targetFree - currentFree
 	if discard < 0 {
 		discard = 0
 	}
-	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
+
 	return discard
 }
 // ShiftCacheSlot makes room in a slot's KV cache: a run of the oldest inputs
 // that follow the first numKeep entries is removed, and the remaining newer
 // inputs are moved down into the gap.
 //
 // The number of inputs to drop comes from ShiftDiscard; when that is zero the
 // slot is left untouched and nil is returned.
 //
 // An error is returned when numKeep is not smaller than the context size
 // (shifting could not free even one entry) or when the KV cache reports that
 // the old entries could not be removed.
 func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
 	if numKeep >= c.numCtx {
 		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
 	}
 	numInputs := len(slot.Inputs)
 	discard := c.ShiftDiscard(numInputs, numKeep)
 	if discard <= 0 {
 		// Nothing needs to be freed.
 		return nil
 	}
 	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", numInputs,
 		"keep", numKeep, "discard", discard)
 	// First index that survives the shift.
 	tailStart := numKeep + discard
 	// TODO (jessegross): KV cache removal can fail for certain types of models
 	if removed := c.lc.KvCacheSeqRm(slot.Id, numKeep, tailStart); !removed {
 		return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard)
 	}
 	c.lc.KvCacheSeqAdd(slot.Id, tailStart, numInputs, -discard)
 	// Mirror the KV cache shift in the slot's bookkeeping: slide the tail
 	// down by discard positions, then trim the leftover entries at the end.
 	for src := tailStart; src < numInputs; src++ {
 		slot.Inputs[src-discard] = slot.Inputs[src]
 	}
 	slot.Inputs = slot.Inputs[:numInputs-discard]
 	return nil
 }
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@ -227,3 +227,66 @@ func TestFindCacheSlot(t *testing.T) {
 		})
 	}
 }
 // TestShiftDiscard is a table-driven check of InputCache.ShiftDiscard: for
 // each combination of context size, keep count and input length it compares
 // the reported discard count with the expected value.
 func TestShiftDiscard(t *testing.T) {
 	type scenario struct {
 		name     string
 		numCtx   int
 		numKeep  int
 		inputLen int
 		expected int
 	}
 	scenarios := []scenario{
 		{name: "Shift", numCtx: 2048, numKeep: 5, inputLen: 2048, expected: 1021},
 		{name: "Max Keep", numCtx: 2048, numKeep: 2047, inputLen: 2048, expected: 1},
 		{name: "No Keep", numCtx: 2048, numKeep: 0, inputLen: 2048, expected: 1024},
 		{name: "Truncate", numCtx: 2048, numKeep: 5, inputLen: 5000, expected: 3973},
 		{name: "Truncate Keep", numCtx: 2048, numKeep: 2047, inputLen: 5000, expected: 2953},
 		{name: "No Op", numCtx: 2048, numKeep: 5, inputLen: 512, expected: 0},
 	}
 	for _, sc := range scenarios {
 		t.Run(sc.name, func(t *testing.T) {
 			cache := InputCache{numCtx: sc.numCtx}
 			if got := cache.ShiftDiscard(sc.inputLen, sc.numKeep); got != sc.expected {
 				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", sc.numCtx, sc.numKeep, sc.inputLen, got, sc.expected)
 			}
 		})
 	}
 }
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@ -20,6 +20,8 @@ import (
 	"time"
 	"unicode/utf8"
 	"golang.org/x/sync/semaphore"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
 )
@ -34,9 +36,6 @@ type input struct {
 }
 type Sequence struct {
 	// number of inputs evaluated
 	numPast int
 	// batch index
 	iBatch int
@ -46,6 +45,9 @@ type Sequence struct {
 	// prompt inputs left to evaluate
 	inputs []input
 	// inputs that have been added to a batch but not yet submitted to Decode
 	pendingInputs []input
 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
 	pendingResponses []string
@ -112,20 +114,19 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		params.numKeep = len(inputs)
 	}
-	if !params.embedding {
+	if s.model.AddBOSToken() {
-		// Subtracting 4 ensures that at least 1 input can be discarded during shift
+		params.numKeep += 1
 		params.numKeep = min(params.numKeep, s.cache.numCtx-4)
 		params.numKeep += s.bosToken
 	} else {
 		// Embeddings are 1 shot - just truncate to the context window, without ever shifting
 		params.numKeep = min(params.numKeep, s.cache.numCtx)
 	}
-	// truncate to fit in context window
+	// Ensure that at least 1 input can be discarded during shift
 	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
 	if len(inputs) > s.cache.numCtx {
-		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
+		discard := len(inputs) - s.cache.numCtx
 		newInputs := inputs[:params.numKeep]
-		newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
+		newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
 		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs))
 		inputs = newInputs
 	}
@ -163,22 +164,26 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 // generating image embeddings for each image
 func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 	var inputs []input
 	var parts []string
 	var matches [][]string
-	re := regexp.MustCompile(`\[img-(\d+)\]`)
+	if s.image != nil {
-	parts := re.Split(prompt, -1)
+		re := regexp.MustCompile(`\[img-(\d+)\]`)
-	matches := re.FindAllStringSubmatch(prompt, -1)
+		parts = re.Split(prompt, -1)
 		matches = re.FindAllStringSubmatch(prompt, -1)
 	} else {
 		parts = []string{prompt}
 	}
 	for i, part := range parts {
 		// text - tokenize
-		if strings.TrimSpace(part) != "" {
+		tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
-			tokens, err := s.lc.Model().Tokenize(part, i == 0, true)
+		if err != nil {
-			if err != nil {
+			return nil, err
-				return nil, err
+		}
 			}
-			for _, t := range tokens {
+		for _, t := range tokens {
-				inputs = append(inputs, input{token: t})
+			inputs = append(inputs, input{token: t})
 			}
 		}
 		// image - generate image embedding
@ -212,41 +217,51 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 }
 type Server struct {
-	model *llama.Model
+	// is the server ready to process requests?
-	lc    *llama.Context
+	// protects access to model and image
 	ready sync.WaitGroup
-	// required for image embeddings
+	// loaded model
 	model *llama.Model
 	// image model context for multi-modal models
 	image *ImageContext
 	// status for external health reporting - loading, ready to serve, etc.
 	status ServerStatus
 	// current progress on loading the model
 	progress float32
 	// number of simultaneous requests to handle
 	parallel int
 	// maximum number of elements in a batch (per sequence)
 	// TODO (jmorganca): make this n_batch
 	batchSize int
-	// parallel is the number of parallel requests to handle
+	// protects access to everything below this line
-	parallel int
+	// this is context state needed for decoding
 	mu sync.Mutex
-	// seqs is the list of parallel sequences being evaluated
+	// indicates that data is ready for processing
-	// TODO (jmorganca): this can probably be moved into run()
+	cond *sync.Cond
 	// decoding state
 	lc *llama.Context
 	// the list of simultaneous sequences being evaluated
 	seqs []*Sequence
 	// seqs can have a maximum of parallel entries, which
 	// is enforced by seqsSem
 	seqsSem *semaphore.Weighted
 	// KV cache
 	cache *InputCache
 	// does this model require a beginning of sequence token?
 	bosToken int
 	// next sequence for prompt processing to avoid starvation
 	nextSeq int
 	// is the server ready to process requests?
 	ready sync.WaitGroup
 	mu sync.Mutex
 	cond *sync.Cond
 	progress float32
 	status ServerStatus
 }
 func (s *Server) allNil() bool {
@ -258,18 +273,6 @@ func (s *Server) allNil() bool {
 	return true
 }
 // shiftContext discards half of the inputs that lie beyond seq.numKeep
 // (numLeft / 2) by calling ShiftCacheSlot on the sequence's cache slot, then
 // lowers seq.numPast by the number of inputs discarded.
 //
 // NOTE(review): the four-argument ShiftCacheSlot call below does not match
 // the two-argument, error-returning ShiftCacheSlot shown for
 // llama/runner/cache.go in this same diff; this looks like pre-change code
 // that the commit removes — confirm before relying on it.
 func (s *Server) shiftContext(seq *Sequence) {
 	// Inputs evaluated so far that are not protected by numKeep.
 	numLeft := seq.numPast - seq.numKeep
 	numDiscard := numLeft / 2
 	slog.Debug("context limit hit - shifting", "limit", s.cache.numCtx, "numPast", seq.numPast,
 		"numKeep", seq.numKeep, "numLeft", numLeft, "numDiscard", numDiscard)
 	s.cache.ShiftCacheSlot(seq.cache, seq.numKeep, numDiscard, seq.numPast)
 	seq.numPast -= numDiscard
 }
 func flushPending(seq *Sequence) bool {
 	joined := strings.Join(seq.pendingResponses, "")
 	seq.pendingResponses = []string{}
@ -305,6 +308,7 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
 	close(seq.embedding)
 	seq.cache.InUse = false
 	s.seqs[seqIndex] = nil
 	s.seqsSem.Release(1)
 }
 func (s *Server) run(ctx context.Context) {
@ -335,7 +339,11 @@ func (s *Server) run(ctx context.Context) {
 		case <-ctx.Done():
 			return
 		default:
-			s.processBatch(tokenBatch, embedBatch)
+			err := s.processBatch(tokenBatch, embedBatch)
 			if err != nil {
 				panic(err)
 			}
 			tokenBatch.Clear()
 			embedBatch.Clear()
 		}
@ -349,7 +357,7 @@ func (s *Server) run(ctx context.Context) {
 // these should instead be handled by the handlers
 // it should only be responsible for accepting tokens or embeddings and
 // processing batches as fast as possible
-func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) {
+func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) error {
 	s.mu.Lock()
 	for s.allNil() {
 		s.cond.Wait() // Wait until an item is added
@ -369,17 +377,23 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}
 		// if past the num predict limit
-		if seq.numPredict > 0 && seq.numPredicted > seq.numPredict {
+		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
 			s.removeSequence(seqIdx, "limit")
 			continue
 		}
 		if seq.numPast+len(seq.inputs) > s.cache.numCtx {
 			s.shiftContext(seq)
 		}
 		var numInputsProcessed int
 		for i, input := range seq.inputs {
 			if len(seq.cache.Inputs)+len(seq.pendingInputs)+1 > s.cache.numCtx {
 				if len(seq.pendingInputs) == 0 {
 					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
 					if err != nil {
 						return err
 					}
 				} else {
 					break
 				}
 			}
 			embedding := input.embed != nil
 			// If we don't currently have a batch, use one of the correct type and
@ -403,28 +417,37 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}
 			crossAttention = seq.crossAttention
-			batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
+			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
-			seq.numPast++
+			seq.pendingInputs = append(seq.pendingInputs, input)
 			numInputsProcessed++
 		}
 		if numInputsProcessed > 0 {
 			seq.cache.Inputs = append(seq.cache.Inputs, seq.inputs[:numInputsProcessed]...)
 			seq.inputs = seq.inputs[numInputsProcessed:]
 			seq.iBatch = batch.NumTokens() - 1
 		}
 		seq.inputs = seq.inputs[len(seq.pendingInputs):]
 	}
 	if batch == nil || batch.NumTokens() == 0 {
-		return
+		return nil
 	}
 	s.lc.SetCrossAttention(crossAttention)
 	err := s.lc.Decode(batch)
 	if err != nil {
-		slog.Error("failed to decode batch", "error", err)
+		if errors.Is(err, llama.ErrKvCacheFull) {
-		return
+			slog.Debug("defragmenting kv cache")
 			s.cache.lc.KvCacheDefrag()
 			err = s.lc.Decode(batch)
 		}
 		if err != nil {
 			return fmt.Errorf("failed to decode batch: %w", err)
 		}
 	}
 	if crossAttention {
 		// synchronize state to ensure the cross attention batch is complete.
 		// needed specifically for multi-GPU systems otherwise an inflight
 		// task may be incorrectly invalidated causing a crash
 		s.lc.Synchronize()
 	}
 	for i, seq := range s.seqs {
@ -432,6 +455,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}
 		// After calling Decode, pending inputs are now in the cache
 		if len(seq.pendingInputs) > 0 {
 			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
 			seq.pendingInputs = []input{}
 		}
 		// don't sample prompt processing
 		if len(seq.inputs) != 0 {
 			continue
@ -444,7 +473,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		// if done processing the prompt, generate an embedding and return
 		if seq.embeddingOnly {
-			embed := s.lc.GetEmbeddingsSeq(i)
+			embed := s.lc.GetEmbeddingsSeq(seq.cache.Id)
 			if embed == nil {
 				embed = s.lc.GetEmbeddingsIth(seq.iBatch)
 			}
@ -514,6 +543,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			s.removeSequence(i, "connection")
 		}
 	}
 	return nil
 }
 // TODO (jmorganca): use structs from the api package to avoid duplication
@ -627,12 +658,21 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
-	// TODO (jmorganca): add to sequence queue instead of
+	// Ensure there is a place to put the sequence, released when removed from s.seqs
-	// failing if a slot isn't available
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
 		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting completion request due to client closing the connection")
 		} else {
 			slog.Error("Failed to acquire semaphore", "error", err)
 		}
 		return
 	}
 	s.mu.Lock()
 	found := false
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@ -643,11 +683,17 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			s.seqs[i] = seq
 			s.cond.Signal()
 			found = true
 			break
 		}
 	}
 	s.mu.Unlock()
 	if !found {
 		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
 		return
 	}
 	for {
 		select {
 		case <-r.Context().Done():
@ -711,11 +757,21 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 		return
 	}
-	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
+	// Ensure there is a place to put the sequence, released when removed from s.seqs
 	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
 		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting embeddings request due to client closing the connection")
 		} else {
 			slog.Error("Failed to acquire semaphore", "error", err)
 		}
 		return
 	}
 	s.mu.Lock()
 	found := false
 	for i, sq := range s.seqs {
 		if sq == nil {
-			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
@ -723,11 +779,17 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 			}
 			s.seqs[i] = seq
 			s.cond.Signal()
 			found = true
 			break
 		}
 	}
 	s.mu.Unlock()
 	if !found {
 		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
 		return
 	}
 	embedding := <-seq.embedding
 	if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
@ -771,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 // multiLPath accumulates every value supplied to a repeatable command-line
 // flag. Its Set and String methods let a *multiLPath be registered with flag.Var.
 type multiLPath []string
 // Set appends value to the collected paths; it always returns nil.
 func (m *multiLPath) Set(value string) error {
 	paths := append(*m, value)
 	*m = paths
 	return nil
 }
 // String renders the collected paths as one comma-separated list.
 func (m *multiLPath) String() string {
 	const sep = ", "
 	collected := []string(*m)
 	return strings.Join(collected, sep)
 }
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
@ -795,17 +868,15 @@ func (s *Server) loadModel(
 		panic(err)
 	}
-	if lpath != "" {
+	if lpath.String() != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
+		for _, path := range lpath {
-		if err != nil {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
-			panic(err)
+			if err != nil {
 				panic(err)
 			}
 		}
 	}
 	if s.model.AddBOSToken() {
 		s.bosToken = 1
 	}
 	if ppath != "" {
 		var err error
 		s.image, err = NewImageContext(s.lc, ppath)
@ -814,7 +885,10 @@ func (s *Server) loadModel(
 		}
 	}
-	s.cache = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
+	s.cache, err = NewInputCache(s.lc, kvSize, s.parallel, multiUserCache)
 	if err != nil {
 		panic(err)
 	}
 	s.status = ServerStatusReady
 	s.ready.Done()
@ -829,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@ -839,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 	var lpaths multiLPath
 	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@ -867,6 +943,7 @@ func main() {
 		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
 		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
 		status:    ServerStatusLoadingModel,
 	}
@ -884,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
@ -893,7 +970,7 @@ func main() {
 	}
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 	server.cond = sync.NewCond(&server.mu)
--- a/llm/filetype.go
+++ b/llm/filetype.go
@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
 	fileTypeIQ3_M
 	fileTypeIQ2_S
 	fileTypeIQ4_XS
 	fileTypeIQ2_M
 	fileTypeIQ4_XS
 	fileTypeIQ1_M
 	fileTypeBF16
@ -93,6 +94,8 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
 	case "IQ3_M":
 		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
 	case "IQ4_XS":
@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
 	case fileTypeIQ3_M:
 		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
--- a/llm/server.go
+++ b/llm/server.go
@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
+		for _, adapter := range adapters {
-		params = append(params, "--lora", adapters[0])
+			params = append(params, "--lora", adapter)
 		}
 	}
 	if len(projectors) > 0 {
@ -306,9 +303,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		// Note: we always put the dependency path first
 		// since this was the exact version we compiled/linked against
-		if gpus[0].DependencyPath != "" {
+		if gpus[0].DependencyPath != nil {
 			// assume gpus from the same library have the same dependency path
-			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
+			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 		}
 		server := filepath.Join(dir, "ollama_llama_server")
@ -687,7 +684,11 @@ type CompletionResponse struct {
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
+		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting completion request due to client closing the connection")
 		} else {
 			slog.Error("Failed to acquire semaphore", "error", err)
 		}
 		return err
 	}
 	defer s.sem.Release(1)
@ -838,13 +839,15 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}
 	if err := scanner.Err(); err != nil {
-		if strings.Contains(err.Error(), "unexpected EOF") {
+		if strings.Contains(err.Error(), "unexpected EOF") || strings.Contains(err.Error(), "forcibly closed") {
 			s.Close()
-			msg := ""
+			var msg string
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
 			} else {
 				msg = err.Error()
 			}
-			return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
+			return fmt.Errorf("an error was encountered while running the model: %s", msg)
 		}
 		return fmt.Errorf("error reading llm response: %v", err)
@ -863,7 +866,11 @@ type EmbeddingResponse struct {
 func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
-		slog.Error("Failed to acquire semaphore", "error", err)
+		if errors.Is(err, context.Canceled) {
 			slog.Info("aborting embedding request due to client closing the connection")
 		} else {
 			slog.Error("Failed to acquire semaphore", "error", err)
 		}
 		return nil, err
 	}
 	defer s.sem.Release(1)
@ -1092,7 +1099,9 @@ func (s *llmServer) EstimatedTotal() uint64 {
 func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
-			return s.estimate.GPUSizes[i]
+			if i < len(s.estimate.GPUSizes) {
 				return s.estimate.GPUSizes[i]
 			}
 		}
 	}
 	return 0
--- a/llm/status.go
+++ b/llm/status.go
@ -27,6 +27,7 @@ var errorPrefixes = []string{
 	"\"ERR\"",
 	"error loading model",
 	"GGML_ASSERT",
 	"Deepseek2 does not support K-shift",
 }
 func (w *StatusWriter) Write(b []byte) (int, error) {
--- a/openai/openai.go
+++ b/openai/openai.go
@ -140,6 +140,7 @@ type CompletionChunk struct {
 type ToolCall struct {
 	ID       string `json:"id"`
 	Index    int    `json:"index"`
 	Type     string `json:"type"`
 	Function struct {
 		Name      string `json:"name"`
@ -200,12 +201,13 @@ func toolCallId() string {
 	return "call_" + strings.ToLower(string(b))
 }
-func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
+func toToolCalls(tc []api.ToolCall) []ToolCall {
-	toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
+	toolCalls := make([]ToolCall, len(tc))
-	for i, tc := range r.Message.ToolCalls {
+	for i, tc := range tc {
 		toolCalls[i].ID = toolCallId()
 		toolCalls[i].Type = "function"
 		toolCalls[i].Function.Name = tc.Function.Name
 		toolCalls[i].Index = tc.Function.Index
 		args, err := json.Marshal(tc.Function.Arguments)
 		if err != nil {
@ -215,7 +217,11 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 		toolCalls[i].Function.Arguments = string(args)
 	}
 	return toolCalls
 }
 func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 	toolCalls := toToolCalls(r.Message.ToolCalls)
 	return ChatCompletion{
 		Id:                id,
 		Object:            "chat.completion",
@ -244,6 +250,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 }
 func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 	toolCalls := toToolCalls(r.Message.ToolCalls)
 	return ChatCompletionChunk{
 		Id:                id,
 		Object:            "chat.completion.chunk",
@ -252,7 +259,7 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 		SystemFingerprint: "fp_ollama",
 		Choices: []ChunkChoice{{
 			Index: 0,
-			Delta: Message{Role: "assistant", Content: r.Message.Content},
+			Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
 			FinishReason: func(reason string) *string {
 				if len(reason) > 0 {
 					return &reason
@ -571,7 +578,7 @@ type EmbedWriter struct {
 	model string
 }
-func (w *BaseWriter) writeError(code int, data []byte) (int, error) {
+func (w *BaseWriter) writeError(data []byte) (int, error) {
 	var serr api.StatusError
 	err := json.Unmarshal(data, &serr)
 	if err != nil {
@ -630,7 +637,7 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {
 func (w *ChatWriter) Write(data []byte) (int, error) {
 	code := w.ResponseWriter.Status()
 	if code != http.StatusOK {
-		return w.writeError(code, data)
+		return w.writeError(data)
 	}
 	return w.writeResponse(data)
@ -679,7 +686,7 @@ func (w *CompleteWriter) writeResponse(data []byte) (int, error) {
 func (w *CompleteWriter) Write(data []byte) (int, error) {
 	code := w.ResponseWriter.Status()
 	if code != http.StatusOK {
-		return w.writeError(code, data)
+		return w.writeError(data)
 	}
 	return w.writeResponse(data)
@ -704,7 +711,7 @@ func (w *ListWriter) writeResponse(data []byte) (int, error) {
 func (w *ListWriter) Write(data []byte) (int, error) {
 	code := w.ResponseWriter.Status()
 	if code != http.StatusOK {
-		return w.writeError(code, data)
+		return w.writeError(data)
 	}
 	return w.writeResponse(data)
@ -730,7 +737,7 @@ func (w *RetrieveWriter) writeResponse(data []byte) (int, error) {
 func (w *RetrieveWriter) Write(data []byte) (int, error) {
 	code := w.ResponseWriter.Status()
 	if code != http.StatusOK {
-		return w.writeError(code, data)
+		return w.writeError(data)
 	}
 	return w.writeResponse(data)
@ -755,7 +762,7 @@ func (w *EmbedWriter) writeResponse(data []byte) (int, error) {
 func (w *EmbedWriter) Write(data []byte) (int, error) {
 	code := w.ResponseWriter.Status()
 	if code != http.StatusOK {
-		return w.writeError(code, data)
+		return w.writeError(data)
 	}
 	return w.writeResponse(data)
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@ -195,7 +195,86 @@ func TestChatMiddleware(t *testing.T) {
 				Stream: &False,
 			},
 		},
-
+		{
 			name: "chat handler with streaming tools",
 			body: `{
 				"model": "test-model",
 				"messages": [
 					{"role": "user", "content": "What's the weather like in Paris?"}
 				],
 				"stream": true,
 				"tools": [{
 					"type": "function",
 					"function": {
 						"name": "get_weather",
 						"description": "Get the current weather",
 						"parameters": {
 							"type": "object",
 							"required": ["location"],
 							"properties": {
 								"location": {
 									"type": "string",
 									"description": "The city and state"
 								},
 								"unit": {
 									"type": "string",
 									"enum": ["celsius", "fahrenheit"]
 								}
 							}
 						}
 					}
 				}]
 			}`,
 			req: api.ChatRequest{
 				Model: "test-model",
 				Messages: []api.Message{
 					{
 						Role:    "user",
 						Content: "What's the weather like in Paris?",
 					},
 				},
 				Tools: []api.Tool{
 					{
 						Type: "function",
 						Function: api.ToolFunction{
 							Name:        "get_weather",
 							Description: "Get the current weather",
 							Parameters: struct {
 								Type       string   `json:"type"`
 								Required   []string `json:"required"`
 								Properties map[string]struct {
 									Type        string   `json:"type"`
 									Description string   `json:"description"`
 									Enum        []string `json:"enum,omitempty"`
 								} `json:"properties"`
 							}{
 								Type:     "object",
 								Required: []string{"location"},
 								Properties: map[string]struct {
 									Type        string   `json:"type"`
 									Description string   `json:"description"`
 									Enum        []string `json:"enum,omitempty"`
 								}{
 									"location": {
 										Type:        "string",
 										Description: "The city and state",
 									},
 									"unit": {
 										Type: "string",
 										Enum: []string{"celsius", "fahrenheit"},
 									},
 								},
 							},
 						},
 					},
 				},
 				Options: map[string]any{
 					"temperature": 1.0,
 					"top_p":       1.0,
 				},
 				Stream: &True,
 			},
 		},
 		{
 			name: "chat handler error forwarding",
 			body: `{
--- a/parser/parser.go
+++ b/parser/parser.go
@ -65,9 +65,22 @@ var (
 	errInvalidCommand     = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
 )
 // ParserError reports a parse failure together with the line on which it was
 // detected, when that line is known.
 type ParserError struct {
 	// LineNumber is the line of the failure; non-positive values are left out of the message.
 	LineNumber int
 	// Msg describes what went wrong.
 	Msg string
 }
 // Error implements the error interface. Msg is prefixed with "(line N): "
 // only when LineNumber is positive.
 func (e *ParserError) Error() string {
 	if e.LineNumber <= 0 {
 		return e.Msg
 	}
 	return fmt.Sprintf("(line %d): %s", e.LineNumber, e.Msg)
 }
 func ParseFile(r io.Reader) (*File, error) {
 	var cmd Command
 	var curr state
 	var currLine int = 1
 	var b bytes.Buffer
 	var role string
@ -84,11 +97,18 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}
 		if isNewline(r) {
 			currLine++
 		}
 		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
 		} else if err != nil {
-			return nil, err
+			return nil, &ParserError{
 				LineNumber: currLine,
 				Msg:        err.Error(),
 			}
 		}
 		// process the state transition, some transitions need to be intercepted and redirected
@ -96,7 +116,10 @@ func ParseFile(r io.Reader) (*File, error) {
 			switch curr {
 			case stateName:
 				if !isValidCommand(b.String()) {
-					return nil, errInvalidCommand
+					return nil, &ParserError{
 						LineNumber: currLine,
 						Msg:        errInvalidCommand.Error(),
 					}
 				}
 				// next state sometimes depends on the current buffer value
@ -117,7 +140,10 @@ func ParseFile(r io.Reader) (*File, error) {
 				cmd.Name = b.String()
 			case stateMessage:
 				if !isValidMessageRole(b.String()) {
-					return nil, errInvalidMessageRole
+					return nil, &ParserError{
 						LineNumber: currLine,
 						Msg:        errInvalidMessageRole.Error(),
 					}
 				}
 				role = b.String()
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@ -3,6 +3,7 @@ package parser
 import (
 	"bytes"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"strings"
@ -180,8 +181,15 @@ func TestParseFileBadCommand(t *testing.T) {
 FROM foo
 BADCOMMAND param1 value1
 `
 	parserError := &ParserError{
 		LineNumber: 3,
 		Msg:        errInvalidCommand.Error(),
 	}
 	_, err := ParseFile(strings.NewReader(input))
-	require.ErrorIs(t, err, errInvalidCommand)
+	if !errors.As(err, &parserError) {
 		t.Errorf("unexpected error: expected: %s, actual: %s", parserError.Error(), err.Error())
 	}
 }
 func TestParseFileMessages(t *testing.T) {
@ -245,7 +253,10 @@ FROM foo
 MESSAGE badguy I'm a bad guy!
 `,
 			nil,
-			errInvalidMessageRole,
+			&ParserError{
 				LineNumber: 3,
 				Msg:        errInvalidMessageRole.Error(),
 			},
 		},
 		{
 			`
@ -264,13 +275,35 @@ MESSAGE system`,
 		},
 	}
-	for _, c := range cases {
+	for _, tt := range cases {
 		t.Run("", func(t *testing.T) {
-			modelfile, err := ParseFile(strings.NewReader(c.input))
+			modelfile, err := ParseFile(strings.NewReader(tt.input))
-			require.ErrorIs(t, err, c.err)
+
 			if modelfile != nil {
-				assert.Equal(t, c.expected, modelfile.Commands)
+				assert.Equal(t, tt.expected, modelfile.Commands)
 			}
 			if tt.err == nil {
 				if err != nil {
 					t.Fatalf("expected no error, but got %v", err)
 				}
 				return
 			}
 			switch tt.err.(type) {
 			case *ParserError:
 				var pErr *ParserError
 				if errors.As(err, &pErr) {
 					// got the correct type of error
 					return
 				}
 			}
 			if errors.Is(err, tt.err) {
 				return
 			}
 			t.Fatalf("unexpected error: expected: %v, actual: %v", tt.err, err)
 		})
 	}
 }
--- a/scripts/env.sh
+++ b/scripts/env.sh
@ -5,7 +5,6 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$V
 # TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
 PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
 DOCKER_ORG=${DOCKER_ORG:-"ollama"}
 RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
 FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
 OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=GOFLAGS \
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -4,9 +4,12 @@
 set -eu
 red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
 plain="$( (/usr/bin/tput sgr0 || :) 2>&-)"
 status() { echo ">>> $*" >&2; }
-error() { echo "ERROR $*"; exit 1; }
+error() { echo "${red}ERROR:${plain} $*"; exit 1; }
-warning() { echo "WARNING: $*"; }
+warning() { echo "${red}WARNING:${plain} $*"; }
 TEMP_DIR=$(mktemp -d)
 # cleanup deletes the temporary directory created above. The path is quoted so
 # that a value containing whitespace or glob characters is removed as one path.
 cleanup() { rm -rf "$TEMP_DIR"; }
@ -93,6 +96,22 @@ else
    fi
 fi
 # Check for NVIDIA JetPack systems with additional downloads
 if [ -f /etc/nv_tegra_release ] ; then
    if grep R36 /etc/nv_tegra_release > /dev/null ; then
        status "Downloading JetPack 6 components"
        curl --fail --show-error --location --progress-bar \
            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack6.tgz${VER_PARAM}" | \
            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
    elif grep R35 /etc/nv_tegra_release > /dev/null ; then
        status "Downloading JetPack 5 components"
        curl --fail --show-error --location --progress-bar \
            "https://ollama.com/download/ollama-linux-${ARCH}-jetpack5.tgz${VER_PARAM}" | \
            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
    else
        warning "Unsupported JetPack version detected.  GPU may not be supported"
    fi
 fi
 install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
@ -146,6 +165,12 @@ EOF
            start_service() { $SUDO systemctl restart ollama; }
            trap start_service EXIT
            ;;
        *)
            warning "systemd is not running"
            if [ "$IS_WSL2" = true ]; then
                warning "see https://learn.microsoft.com/en-us/windows/wsl/systemd#how-to-enable-systemd to enable it"
            fi
            ;;
    esac
 }
@ -163,6 +188,13 @@ if [ "$IS_WSL2" = true ]; then
    exit 0
 fi
 # Don't attempt to install drivers on Jetson systems
 if [ -f /etc/nv_tegra_release ] ; then
    status "NVIDIA JetPack ready."
    install_success
    exit 0
 fi
 # Install GPU dependencies on Linux
 if ! available lspci && ! available lshw; then
    warning "Unable to detect NVIDIA/AMD GPU. Install lspci or lshw to automatically detect and install GPU dependencies."
--- a/server/images.go
+++ b/server/images.go
@ -5,7 +5,6 @@ import (
 	"cmp"
 	"context"
 	"crypto/sha256"
 	"encoding/base64"
 	"encoding/hex"
 	"encoding/json"
 	"errors"
@ -13,6 +12,7 @@ import (
 	"io"
 	"log"
 	"log/slog"
 	"net"
 	"net/http"
 	"net/url"
 	"os"
@ -23,14 +23,12 @@ import (
 	"strings"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@ -984,37 +982,7 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
 var errUnauthorized = errors.New("unauthorized: access denied")
 // getTokenSubject returns the "sub" claim of a JWT. It does not validate the
 // token's signature or any other claim.
 //
 // It returns "" when the token does not consist of exactly three dot-separated
 // segments, when the payload segment is not valid unpadded base64url or JSON,
 // or when the "sub" claim is missing or is not a JSON string. Failures that
 // occur after the segment check are logged.
 func getTokenSubject(token string) string {
 	parts := strings.Split(token, ".")
 	if len(parts) != 3 {
 		return ""
 	}
 	payloadBytes, err := base64.RawURLEncoding.DecodeString(parts[1])
 	if err != nil {
 		slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err))
 		return ""
 	}
 	var payloadMap map[string]any
 	if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil {
 		slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err))
 		return ""
 	}
 	sub, ok := payloadMap["sub"]
 	if !ok {
 		slog.Error("jwt does not contain 'sub' field")
 		return ""
 	}
 	// Formatting a non-string claim with %s would produce text such as
 	// "%!s(float64=123)", so only a genuine string is accepted as the subject.
 	subject, ok := sub.(string)
 	if !ok {
 		slog.Error("jwt 'sub' field is not a string")
 		return ""
 	}
 	return subject
 }
 func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
 	anonymous := true // access will default to anonymous if no user is found associated with the public key
 	for range 2 {
 		resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
 		if err != nil {
@ -1035,7 +1003,6 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 			if err != nil {
 				return nil, err
 			}
 			anonymous = getTokenSubject(token) == "anonymous"
 			regOpts.Token = token
 			if body != nil {
 				_, err = body.Seek(0, io.SeekStart)
@ -1058,19 +1025,24 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 		}
 	}
 	if anonymous {
 		// no user is associated with the public key, and the request requires non-anonymous access
 		pubKey, nestedErr := auth.GetPublicKey()
 		if nestedErr != nil {
 			slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
 			return nil, errUnauthorized
 		}
 		return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
 	}
 	// user is associated with the public key, but is not authorized to make the request
 	return nil, errUnauthorized
 }
 // testMakeRequestDialContext specifies the dial function for the http client in
 // makeRequest. It can be used to resolve hosts in model names to local
 // addresses for testing. For example, the model name ("example.com/my/model")
 // can be directed to push/pull from "127.0.0.1:1234".
 //
 // This is not safe to set across goroutines. It should be set in
 // the main test goroutine, and not by tests marked to run in parallel with
 // t.Parallel().
 //
 // It should be cleared after use, otherwise it will affect other tests.
 //
 // Ideally this would be set further up the stack, but the code is not
 // structured in a way that makes this easy, so this will have to do for now.
 var testMakeRequestDialContext func(ctx context.Context, network, addr string) (net.Conn, error)
 func makeRequest(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.Reader, regOpts *registryOptions) (*http.Response, error) {
 	if requestURL.Scheme != "http" && regOpts != nil && regOpts.Insecure {
 		requestURL.Scheme = "http"
@ -1104,14 +1076,15 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}
-	resp, err := (&http.Client{
+	c := &http.Client{
 		CheckRedirect: regOpts.CheckRedirect,
 	}).Do(req)
 	if err != nil {
 		return nil, err
 	}
-
+	if testMakeRequestDialContext != nil {
-	return resp, nil
+		tr := http.DefaultTransport.(*http.Transport).Clone()
 		tr.DialContext = testMakeRequestDialContext
 		c.Transport = tr
 	}
 	return c.Do(req)
 }
 func getValue(header, key string) string {
--- a/server/model_test.go
+++ b/server/model_test.go
@ -39,6 +39,7 @@ func TestExecuteWithTools(t *testing.T) {
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
 The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false},
 		{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
 		[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@ -32,7 +32,7 @@ func TestChatPrompt(t *testing.T) {
 	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
 	createImg := func(width, height int) ([]byte, error) {
-		img := image.NewRGBA(image.Rect(0, 0, 5, 5))
+		img := image.NewRGBA(image.Rect(0, 0, width, height))
 		var buf bytes.Buffer
 		if err := png.Encode(&buf, img); err != nil {
--- a/server/routes.go
+++ b/server/routes.go
@ -507,7 +507,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
+		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embedding: %v", err)})
 		return
 	}
@ -540,7 +540,8 @@ func (s *Server) PullHandler(c *gin.Context) {
 		return
 	}
-	if err := checkNameExists(name); err != nil {
+	name, err = getExistingName(name)
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@ -621,19 +622,20 @@ func (s *Server) PushHandler(c *gin.Context) {
 	streamResponse(c, ch)
 }
-func checkNameExists(name model.Name) error {
+// getExistingName returns the original, on disk name if the input name is a
-	names, err := Manifests(true)
+// case-insensitive match, otherwise it returns the input name.
 func getExistingName(n model.Name) (model.Name, error) {
 	var zero model.Name
 	existing, err := Manifests(true)
 	if err != nil {
-		return err
+		return zero, err
 	}
-
+	for e := range existing {
-	for n := range names {
+		if n.EqualFold(e) {
-		if strings.EqualFold(n.Filepath(), name.Filepath()) && n != name {
+			return e, nil
 			return errors.New("a model with that name already exists")
 		}
 	}
-
+	return n, nil
 	return nil
 }
 func (s *Server) CreateHandler(c *gin.Context) {
@ -652,7 +654,8 @@ func (s *Server) CreateHandler(c *gin.Context) {
 		return
 	}
-	if err := checkNameExists(name); err != nil {
+	name, err := getExistingName(name)
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@ -958,14 +961,19 @@ func (s *Server) CopyHandler(c *gin.Context) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("source %q is invalid", r.Source)})
 		return
 	}
 	src, err := getExistingName(src)
 	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 	dst := model.ParseName(r.Destination)
 	if !dst.IsValid() {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)})
 		return
 	}
-
+	dst, err = getExistingName(dst)
-	if err := checkNameExists(dst); err != nil {
+	if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
@ -1133,7 +1141,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
 	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
-	openAIProperties := []string{"lang", "package-version", "os", "arch", "runtime", "runtime-version", "async"}
+	openAIProperties := []string{"lang", "package-version", "os", "arch", "retry-count", "runtime", "runtime-version", "async"}
 	for _, prop := range openAIProperties {
 		config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
 	}
@ -1450,6 +1458,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@ -1459,6 +1468,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
 		var sb strings.Builder
 		var toolCallIndex int = 0
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Images:  images,
@ -1484,7 +1495,37 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 			}
-			ch <- res
+			// TODO: tool call checking and filtering should be moved outside of this callback once streaming
 			// is reworked; however, this was a simple change for now that avoids reworking the streaming
 			// logic of this (and other) handlers.
 			if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 {
 				ch <- res
 				return
 			}
 			// Streaming tool calls:
 			// If tools are recognized, use a flag to track the sending of a tool downstream
 			// This ensures that content is cleared from the message on the last chunk sent
 			sb.WriteString(r.Content)
 			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
 				res.Message.ToolCalls = toolCalls
 				for i := range toolCalls {
 					toolCalls[i].Function.Index = toolCallIndex
 					toolCallIndex++
 				}
 				res.Message.Content = ""
 				sb.Reset()
 				ch <- res
 				return
 			}
 			if r.Done {
 				// Send any remaining content if no tool calls were detected
 				if toolCallIndex == 0 {
 					res.Message.Content = sb.String()
 				}
 				ch <- res
 			}
 		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@ -8,6 +8,7 @@ import (
 	"io"
 	"net/http"
 	"strings"
 	"sync"
 	"testing"
 	"time"
@ -25,10 +26,14 @@ type mockRunner struct {
 	// CompletionRequest is only valid until the next call to Completion
 	llm.CompletionRequest
 	llm.CompletionResponse
 	CompletionFn func(context.Context, llm.CompletionRequest, func(llm.CompletionResponse)) error
 }
-func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
+func (m *mockRunner) Completion(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
 	// Record the most recent request on the mock; it is overwritten by the
 	// next call to Completion.
 	m.CompletionRequest = r
 	// When a CompletionFn hook is set it takes over entirely: it receives the
 	// context, the request and the callback, and its error is returned as-is.
 	if m.CompletionFn != nil {
 		return m.CompletionFn(ctx, r, fn)
 	}
 	// Otherwise deliver the single canned CompletionResponse to the callback
 	// and report success.
 	fn(m.CompletionResponse)
 	return nil
 }
@ -88,9 +93,14 @@ func TestGenerateChat(t *testing.T) {
 		Model: "test",
 		Modelfile: fmt.Sprintf(`FROM %s
 		TEMPLATE """
-{{- if .System }}System: {{ .System }} {{ end }}
+{{- if .Tools }}
-{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
+{{ .Tools }}
-{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
+{{ end }}
 {{- range .Messages }}
 {{- .Role }}: {{ .Content }}
 {{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
 {{- end }}
 {{ end }}"""
 `, createBinFile(t, llm.KV{
 			"general.architecture":          "llama",
 			"llama.block_count":             uint32(1),
@ -263,7 +273,7 @@ func TestGenerateChat(t *testing.T) {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "user: Hello!\n"); diff != "" {
 			t.Errorf("mismatch (-got +want):\n%s", diff)
 		}
@ -292,7 +302,7 @@ func TestGenerateChat(t *testing.T) {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\n"); diff != "" {
 			t.Errorf("mismatch (-got +want):\n%s", diff)
 		}
@ -314,7 +324,7 @@ func TestGenerateChat(t *testing.T) {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You can perform magic tricks.\nuser: Hello!\n"); diff != "" {
 			t.Errorf("mismatch (-got +want):\n%s", diff)
 		}
@ -337,12 +347,242 @@ func TestGenerateChat(t *testing.T) {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. "); diff != "" {
+		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "system: You are a helpful assistant.\nuser: Hello!\nassistant: I can help you with that.\nsystem: You can perform magic tricks.\nuser: Help me write tests.\n"); diff != "" {
 			t.Errorf("mismatch (-got +want):\n%s", diff)
 		}
 		checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
 	})
 	t.Run("messages with tools (non-streaming)", func(t *testing.T) {
 		if w.Code != http.StatusOK {
 			t.Fatalf("failed to create test-system model: %d", w.Code)
 		}
 		tools := []api.Tool{
 			{
 				Type: "function",
 				Function: api.ToolFunction{
 					Name:        "get_weather",
 					Description: "Get the current weather",
 					Parameters: struct {
 						Type       string   `json:"type"`
 						Required   []string `json:"required"`
 						Properties map[string]struct {
 							Type        string   `json:"type"`
 							Description string   `json:"description"`
 							Enum        []string `json:"enum,omitempty"`
 						} `json:"properties"`
 					}{
 						Type:     "object",
 						Required: []string{"location"},
 						Properties: map[string]struct {
 							Type        string   `json:"type"`
 							Description string   `json:"description"`
 							Enum        []string `json:"enum,omitempty"`
 						}{
 							"location": {
 								Type:        "string",
 								Description: "The city and state",
 							},
 							"unit": {
 								Type: "string",
 								Enum: []string{"celsius", "fahrenheit"},
 							},
 						},
 					},
 				},
 			},
 		}
 		mock.CompletionResponse = llm.CompletionResponse{
 			Content:            `{"name":"get_weather","arguments":{"location":"Seattle, WA","unit":"celsius"}}`,
 			Done:               true,
 			DoneReason:         "done",
 			PromptEvalCount:    1,
 			PromptEvalDuration: 1,
 			EvalCount:          1,
 			EvalDuration:       1,
 		}
 		streamRequest := true
 		w := createRequest(t, s.ChatHandler, api.ChatRequest{
 			Model: "test-system",
 			Messages: []api.Message{
 				{Role: "user", Content: "What's the weather in Seattle?"},
 			},
 			Tools:  tools,
 			Stream: &streamRequest,
 		})
 		if w.Code != http.StatusOK {
 			var errResp struct {
 				Error string `json:"error"`
 			}
 			if err := json.NewDecoder(w.Body).Decode(&errResp); err != nil {
 				t.Logf("Failed to decode error response: %v", err)
 			} else {
 				t.Logf("Error response: %s", errResp.Error)
 			}
 		}
 		if w.Code != http.StatusOK {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
 		var resp api.ChatResponse
 		if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
 			t.Fatal(err)
 		}
 		if resp.Message.ToolCalls == nil {
 			t.Error("expected tool calls, got nil")
 		}
 		expectedToolCall := api.ToolCall{
 			Function: api.ToolCallFunction{
 				Name: "get_weather",
 				Arguments: api.ToolCallFunctionArguments{
 					"location": "Seattle, WA",
 					"unit":     "celsius",
 				},
 			},
 		}
 		if diff := cmp.Diff(resp.Message.ToolCalls[0], expectedToolCall); diff != "" {
 			t.Errorf("tool call mismatch (-got +want):\n%s", diff)
 		}
 	})
 	t.Run("messages with tools (streaming)", func(t *testing.T) {
 		tools := []api.Tool{
 			{
 				Type: "function",
 				Function: api.ToolFunction{
 					Name:        "get_weather",
 					Description: "Get the current weather",
 					Parameters: struct {
 						Type       string   `json:"type"`
 						Required   []string `json:"required"`
 						Properties map[string]struct {
 							Type        string   `json:"type"`
 							Description string   `json:"description"`
 							Enum        []string `json:"enum,omitempty"`
 						} `json:"properties"`
 					}{
 						Type:     "object",
 						Required: []string{"location"},
 						Properties: map[string]struct {
 							Type        string   `json:"type"`
 							Description string   `json:"description"`
 							Enum        []string `json:"enum,omitempty"`
 						}{
 							"location": {
 								Type:        "string",
 								Description: "The city and state",
 							},
 							"unit": {
 								Type: "string",
 								Enum: []string{"celsius", "fahrenheit"},
 							},
 						},
 					},
 				},
 			},
 		}
 		// Simulate streaming response with multiple chunks
 		var wg sync.WaitGroup
 		wg.Add(1)
 		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
 			defer wg.Done()
 			// Send chunks with small delays to simulate streaming
 			responses := []llm.CompletionResponse{
 				{
 					Content:            `{"name":"get_`,
 					Done:               false,
 					PromptEvalCount:    1,
 					PromptEvalDuration: 1,
 				},
 				{
 					Content:            `weather","arguments":{"location":"Seattle`,
 					Done:               false,
 					PromptEvalCount:    2,
 					PromptEvalDuration: 1,
 				},
 				{
 					Content:            `, WA","unit":"celsius"}}`,
 					Done:               true,
 					DoneReason:         "tool_call",
 					PromptEvalCount:    3,
 					PromptEvalDuration: 1,
 				},
 			}
 			for _, resp := range responses {
 				select {
 				case <-ctx.Done():
 					return ctx.Err()
 				default:
 					fn(resp)
 					time.Sleep(10 * time.Millisecond) // Small delay between chunks
 				}
 			}
 			return nil
 		}
 		w := createRequest(t, s.ChatHandler, api.ChatRequest{
 			Model: "test-system",
 			Messages: []api.Message{
 				{Role: "user", Content: "What's the weather in Seattle?"},
 			},
 			Tools:  tools,
 			Stream: &stream,
 		})
 		wg.Wait()
 		if w.Code != http.StatusOK {
 			t.Errorf("expected status 200, got %d", w.Code)
 		}
 		// Read and validate the streamed responses
 		decoder := json.NewDecoder(w.Body)
 		var finalToolCall api.ToolCall
 		for {
 			var resp api.ChatResponse
 			if err := decoder.Decode(&resp); err == io.EOF {
 				break
 			} else if err != nil {
 				t.Fatal(err)
 			}
 			if resp.Done {
 				if len(resp.Message.ToolCalls) != 1 {
 					t.Errorf("expected 1 tool call in final response, got %d", len(resp.Message.ToolCalls))
 				}
 				finalToolCall = resp.Message.ToolCalls[0]
 			}
 		}
 		expectedToolCall := api.ToolCall{
 			Function: api.ToolCallFunction{
 				Name: "get_weather",
 				Arguments: api.ToolCallFunctionArguments{
 					"location": "Seattle, WA",
 					"unit":     "celsius",
 				},
 			},
 		}
 		if diff := cmp.Diff(finalToolCall, expectedToolCall); diff != "" {
 			t.Errorf("final tool call mismatch (-got +want):\n%s", diff)
 		}
 	})
 }
 func TestGenerate(t *testing.T) {
--- a/server/routes_test.go
+++ b/server/routes_test.go
@ -7,13 +7,18 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
 	"io/fs"
 	"math"
 	"math/rand/v2"
 	"net"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"sort"
 	"strings"
 	"testing"
 	"unicode"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
@ -473,83 +478,129 @@ func Test_Routes(t *testing.T) {
 	}
 }
-func TestCase(t *testing.T) {
+func casingShuffle(s string) string {
 	// Returns a copy of s in which every rune has been replaced by either its
 	// unicode.ToUpper or unicode.ToLower form, chosen independently at random
 	// for each position. The result is therefore not deterministic.
 	//
 	// The string is converted to runes so that casing is applied per code
 	// point rather than per byte.
 	rr := []rune(s)
 	for i := range rr {
 		// rand.N(2) yields 0 or 1: 0 upper-cases the rune, 1 lower-cases it.
 		if rand.N(2) == 0 {
 			rr[i] = unicode.ToUpper(rr[i])
 		} else {
 			rr[i] = unicode.ToLower(rr[i])
 		}
 	}
 	return string(rr)
 }
 func TestManifestCaseSensitivity(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
-	cases := []string{
+	r := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		"mistral",
+		w.WriteHeader(http.StatusOK)
-		"llama3:latest",
+		io.WriteString(w, `{}`) //nolint:errcheck
-		"library/phi3:q4_0",
+	}))
-		"registry.ollama.ai/library/gemma:q5_K_M",
+	defer r.Close()
-		// TODO: host:port currently fails on windows (#4107)
+
-		// "localhost:5000/alice/bob:latest",
+	nameUsed := make(map[string]bool)
 	name := func() string {
 		const fqmn = "example/namespace/model:tag"
 		for {
 			v := casingShuffle(fqmn)
 			if nameUsed[v] {
 				continue
 			}
 			nameUsed[v] = true
 			return v
 		}
 	}
 	wantStableName := name()
 	// checkManifestList tests that there is strictly one manifest in the
 	// models directory, and that the manifest is for the model under test.
 	checkManifestList := func() {
 		t.Helper()
 		mandir := filepath.Join(os.Getenv("OLLAMA_MODELS"), "manifests/")
 		var entries []string
 		t.Logf("dir entries:")
 		fsys := os.DirFS(mandir)
 		err := fs.WalkDir(fsys, ".", func(path string, info fs.DirEntry, err error) error {
 			if err != nil {
 				return err
 			}
 			t.Logf("    %s", fs.FormatDirEntry(info))
 			if info.IsDir() {
 				return nil
 			}
 			path = strings.TrimPrefix(path, mandir)
 			entries = append(entries, path)
 			return nil
 		})
 		if err != nil {
 			t.Fatalf("failed to walk directory: %v", err)
 		}
 		if len(entries) != 1 {
 			t.Errorf("len(got) = %d, want 1", len(entries))
 			return // do not use Fatal so following steps run
 		}
 		g := entries[0] // raw path
 		g = filepath.ToSlash(g)
 		w := model.ParseName(wantStableName).Filepath()
 		w = filepath.ToSlash(w)
 		if g != w {
 			t.Errorf("\ngot:  %s\nwant: %s", g, w)
 		}
 	}
 	checkOK := func(w *httptest.ResponseRecorder) {
 		t.Helper()
 		if w.Code != http.StatusOK {
 			t.Errorf("code = %d, want 200", w.Code)
 			t.Logf("body: %s", w.Body.String())
 		}
 	}
 	var s Server
-	for _, tt := range cases {
+	testMakeRequestDialContext = func(ctx context.Context, _, _ string) (net.Conn, error) {
-		t.Run(tt, func(t *testing.T) {
+		var d net.Dialer
-			w := createRequest(t, s.CreateHandler, api.CreateRequest{
+		return d.DialContext(ctx, "tcp", r.Listener.Addr().String())
 				Name:      tt,
 				Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 				Stream:    &stream,
 			})
 			if w.Code != http.StatusOK {
 				t.Fatalf("expected status 200 got %d", w.Code)
 			}
 			expect, err := json.Marshal(map[string]string{"error": "a model with that name already exists"})
 			if err != nil {
 				t.Fatal(err)
 			}
 			t.Run("create", func(t *testing.T) {
 				w = createRequest(t, s.CreateHandler, api.CreateRequest{
 					Name:      strings.ToUpper(tt),
 					Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 					Stream:    &stream,
 				})
 				if w.Code != http.StatusBadRequest {
 					t.Fatalf("expected status 500 got %d", w.Code)
 				}
 				if !bytes.Equal(w.Body.Bytes(), expect) {
 					t.Fatalf("expected error %s got %s", expect, w.Body.String())
 				}
 			})
 			t.Run("pull", func(t *testing.T) {
 				w := createRequest(t, s.PullHandler, api.PullRequest{
 					Name:   strings.ToUpper(tt),
 					Stream: &stream,
 				})
 				if w.Code != http.StatusBadRequest {
 					t.Fatalf("expected status 500 got %d", w.Code)
 				}
 				if !bytes.Equal(w.Body.Bytes(), expect) {
 					t.Fatalf("expected error %s got %s", expect, w.Body.String())
 				}
 			})
 			t.Run("copy", func(t *testing.T) {
 				w := createRequest(t, s.CopyHandler, api.CopyRequest{
 					Source:      tt,
 					Destination: strings.ToUpper(tt),
 				})
 				if w.Code != http.StatusBadRequest {
 					t.Fatalf("expected status 500 got %d", w.Code)
 				}
 				if !bytes.Equal(w.Body.Bytes(), expect) {
 					t.Fatalf("expected error %s got %s", expect, w.Body.String())
 				}
 			})
 		})
 	}
 	t.Cleanup(func() { testMakeRequestDialContext = nil })
 	t.Logf("creating")
 	checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
 		// Start with the stable name, and later use a case-shuffled
 		// version.
 		Name: wantStableName,
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
 	}))
 	checkManifestList()
 	t.Logf("creating (again)")
 	checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:      name(),
 		Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, nil, nil)),
 		Stream:    &stream,
 	}))
 	checkManifestList()
 	t.Logf("pulling")
 	checkOK(createRequest(t, s.PullHandler, api.PullRequest{
 		Name:     name(),
 		Stream:   &stream,
 		Insecure: true,
 	}))
 	checkManifestList()
 	t.Logf("copying")
 	checkOK(createRequest(t, s.CopyHandler, api.CopyRequest{
 		Source:      name(),
 		Destination: name(),
 	}))
 	checkManifestList()
 }
 func TestShow(t *testing.T) {
--- a/types/model/name.go
+++ b/types/model/name.go
@ -298,6 +298,13 @@ func (n Name) LogValue() slog.Value {
 	return slog.StringValue(n.String())
 }
 // EqualFold reports whether n and o have the same Host, Namespace, Model and
 // Tag when each part is compared with strings.EqualFold, i.e. ignoring case.
 // No other part of the name takes part in the comparison.
 func (n Name) EqualFold(o Name) bool {
 	// Bail out on the first part that differs, checking them in the order
 	// host, namespace, model, tag.
 	switch {
 	case !strings.EqualFold(n.Host, o.Host):
 		return false
 	case !strings.EqualFold(n.Namespace, o.Namespace):
 		return false
 	case !strings.EqualFold(n.Model, o.Model):
 		return false
 	}
 	return strings.EqualFold(n.Tag, o.Tag)
 }
 func isValidLen(kind partKind, s string) bool {
 	switch kind {
 	case kindHost: