31 changed files with 445 additions and 1830 deletions
--- a/README.md
+++ b/README.md
@ -294,8 +294,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)

 ### Terminal

--- a/api/types.go
+++ b/api/types.go
@ -47,9 +47,6 @@ type GenerateRequest struct {
 	// Prompt is the textual prompt to send to the model.
 	Prompt string `json:"prompt"`

-	// Suffix is the text that comes after the inserted text.
-	Suffix string `json:"suffix"`
-
 	// System overrides the model's default system message/prompt.
 	System string `json:"system"`

@ -101,29 +98,48 @@ type ChatRequest struct {
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	// Tools is an optional list of tools the model has access to.
-	Tools `json:"tools,omitempty"`
+	Tools []Tool `json:"tools,omitempty"`

 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

-type Tools []Tool
-
-func (t Tools) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
 // Message is a single message in a chat sequence. The message contains the
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
 	Role      string      `json:"role"`
-	Content   string      `json:"content"`
+	Content   string      `json:"content,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
 }

+type ToolCall struct {
+	ID       string `json:"id"`
+	Type     string `json:"type"`
+	Function struct {
+		Name      string         `json:"name"`
+		Arguments map[string]any `json:"arguments"`
+	} `json:"function"`
+}
+
+type Tool struct {
+	Type     string `json:"type"`
+	Function struct {
+		Name        string `json:"name"`
+		Description string `json:"description"`
+		Parameters  struct {
+			Type       string   `json:"type"`
+			Required   []string `json:"required"`
+			Properties map[string]struct {
+				Type        string   `json:"type"`
+				Description string   `json:"description"`
+				Enum        []string `json:"enum,omitempty"`
+			} `json:"properties"`
+		} `json:"parameters"`
+	} `json:"function"`
+}
+
 func (m *Message) UnmarshalJSON(b []byte) error {
 	type Alias Message
 	var a Alias
@ -136,46 +152,6 @@ func (m *Message) UnmarshalJSON(b []byte) error {
 	return nil
 }

-type ToolCall struct {
-	Function ToolCallFunction `json:"function"`
-}
-
-type ToolCallFunction struct {
-	Name      string                    `json:"name"`
-	Arguments ToolCallFunctionArguments `json:"arguments"`
-}
-
-type ToolCallFunctionArguments map[string]any
-
-func (t *ToolCallFunctionArguments) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
-type Tool struct {
-	Type     string       `json:"type"`
-	Function ToolFunction `json:"function"`
-}
-
-type ToolFunction struct {
-	Name        string `json:"name"`
-	Description string `json:"description"`
-	Parameters  struct {
-		Type       string   `json:"type"`
-		Required   []string `json:"required"`
-		Properties map[string]struct {
-			Type        string   `json:"type"`
-			Description string   `json:"description"`
-			Enum        []string `json:"enum,omitempty"`
-		} `json:"properties"`
-	} `json:"parameters"`
-}
-
-func (t *ToolFunction) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
 // ChatResponse is the response returned by [Client.Chat]. Its fields are
 // similar to [GenerateResponse].
 type ChatResponse struct {
@ -428,6 +404,9 @@ type GenerateResponse struct {
 	// Response is the textual response itself.
 	Response string `json:"response"`

+	// ToolCalls is the list of tools the model wants to call.
+	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`

--- a/docs/gpu.md
+++ b/docs/gpu.md
@ -46,24 +46,13 @@ sudo modprobe nvidia_uvm`

 ## AMD Radeon
 Ollama supports the following AMD GPUs:
-
-### Linux Support
 | Family         | Cards and accelerators                                                                                                               |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56`    |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
 | AMD Instinct   | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50`                                                               |

-### Windows Support
-With ROCm v6.1, the following GPUs are supported on Windows.
-
-| Family         | Cards and accelerators                                                                                                               |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800`    |
-| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
-
-
-### Overrides on Linux
+### Overrides
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
 some cases you can force the system to try to use a similar LLVM target that is
 close.  For example The Radeon RX 5400 is `gfx1034` (also known as 10.3.4)
@ -74,7 +63,7 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server.  If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.

-At this time, the known supported GPU types on linux are the following LLVM Targets.
+At this time, the known supported GPU types are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
--- a/docs/openai.md
+++ b/docs/openai.md
@ -103,6 +103,10 @@ curl http://localhost:11434/v1/chat/completions \
 - [ ] `user`
 - [ ] `n`

+#### Notes
+
+- `usage.prompt_tokens` will be 0 for completions where the prompt evaluation is cached.
+
 ## Models

 Before using a model, pull it locally `ollama pull`:
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@ -33,10 +33,9 @@ type HipLib struct {
 }

 func NewHipLib() (*HipLib, error) {
-	// At runtime we depend on v6, so discover GPUs with the same library for a consistent set of GPUs
-	h, err := windows.LoadLibrary("amdhip64_6.dll")
+	h, err := windows.LoadLibrary("amdhip64.dll")
 	if err != nil {
-		return nil, fmt.Errorf("unable to load amdhip64_6.dll, please make sure to upgrade to the latest amd driver: %w", err)
+		return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err)
 	}
 	hl := &HipLib{}
 	hl.dll = h
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@ -92,8 +92,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			continue
 		}
 		if gfxOverride == "" {
-			// Strip off Target Features when comparing
-			if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
+			if !slices.Contains[[]string, string](supported, gfx) {
 				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
--- a/integration/context_test.go
+++ b/integration/context_test.go
@ -12,7 +12,7 @@ import (

 func TestContextExhaustion(t *testing.T) {
 	// Longer needed for small footprint GPUs
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
@ -25,10 +25,5 @@ func TestContextExhaustion(t *testing.T) {
 			"num_ctx":     128,
 		},
 	}
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	if err := PullIfMissing(ctx, client, req.Model); err != nil {
-		t.Fatalf("PullIfMissing failed: %v", err)
-	}
-	DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
+	GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
 }
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -7,8 +7,8 @@ function amdGPUs {
        return $env:AMDGPU_TARGETS
    }
    # Current supported rocblas list from ROCm v6.1.2 on windows
-    # https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
    $GPU_LIST = @(
+        "gfx906:xnack-"
        "gfx1030"
        "gfx1100"
        "gfx1101"
--- a/llm/gguf.go
+++ b/llm/gguf.go
@ -537,7 +537,6 @@ var ggufKVOrder = map[string][]string{
 		"tokenizer.ggml.add_bos_token",
 		"tokenizer.ggml.add_eos_token",
 		"tokenizer.chat_template",
-		"bert.pooling_type",
 	},
 }

--- a/llm/patches/10-tekken.diff
+++ b/llm/patches/10-tekken.diff
@ -1,43 +0,0 @@
-diff --git a/include/llama.h b/include/llama.h
-index bb4b05ba..a92174e0 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -92,6 +92,7 @@ extern "C" {
-         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-     };
- 
-     // note: these values should be synchronized with ggml_rope
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 18364976..435b6fe5 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
-             } else if (
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
-+            } else if (
-+                tokenizer_pre == "tekken") {
-+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
-+                vocab.tokenizer_clean_spaces = false;
-+                vocab.tokenizer_ignore_merges = true;
-+                vocab.tokenizer_add_bos = true;
-             } else {
-                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
-                     " ?[^(\\s|.,!?…。，、।۔،)]+",
-                 };
-                 break;
-+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
-+                    // original regex from tokenizer.json
-+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-+                regex_exprs = {
-+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-+                };
-+                break;
-             default:
-                 // default regex for BPE tokenization pre-processing
-                 regex_exprs = {
--- a/llm/patches/11-embd_kv.diff
+++ b/llm/patches/11-embd_kv.diff
@ -1,19 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..e60d3d8d 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
- 
-                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd,  n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
- 
-                         // optional bias tensors
-                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
--- a/llm/server.go
+++ b/llm/server.go
@ -385,10 +385,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			filteredEnv := []string{}
 			for _, ev := range s.cmd.Env {
 				if strings.HasPrefix(ev, "CUDA_") ||
-					strings.HasPrefix(ev, "ROCR_") ||
 					strings.HasPrefix(ev, "ROCM_") ||
 					strings.HasPrefix(ev, "HIP_") ||
-					strings.HasPrefix(ev, "GPU_") ||
 					strings.HasPrefix(ev, "HSA_") ||
 					strings.HasPrefix(ev, "GGML_") ||
 					strings.HasPrefix(ev, "PATH=") ||
--- a/openai/openai.go
+++ b/openai/openai.go
@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"log/slog"
 	"math/rand"
 	"net/http"
 	"strings"
@ -30,9 +29,8 @@ type ErrorResponse struct {
 }

 type Message struct {
-	Role      string     `json:"role"`
-	Content   any        `json:"content"`
-	ToolCalls []ToolCall `json:"tool_calls,omitempty"`
+	Role    string `json:"role"`
+	Content any    `json:"content"`
 }

 type Choice struct {
@ -63,11 +61,6 @@ type ResponseFormat struct {
 	Type string `json:"type"`
 }

-type EmbedRequest struct {
-	Input any    `json:"input"`
-	Model string `json:"model"`
-}
-
 type ChatCompletionRequest struct {
 	Model            string          `json:"model"`
 	Messages         []Message       `json:"messages"`
@ -80,7 +73,6 @@ type ChatCompletionRequest struct {
 	PresencePenalty  *float64        `json:"presence_penalty_penalty"`
 	TopP             *float64        `json:"top_p"`
 	ResponseFormat   *ResponseFormat `json:"response_format"`
-	Tools            []api.Tool      `json:"tools"`
 }

 type ChatCompletion struct {
@ -114,7 +106,6 @@ type CompletionRequest struct {
 	Stream           bool     `json:"stream"`
 	Temperature      *float32 `json:"temperature"`
 	TopP             float32  `json:"top_p"`
-	Suffix           string   `json:"suffix"`
 }

 type Completion struct {
@ -136,15 +127,6 @@ type CompletionChunk struct {
 	SystemFingerprint string                `json:"system_fingerprint"`
 }

-type ToolCall struct {
-	ID       string `json:"id"`
-	Type     string `json:"type"`
-	Function struct {
-		Name      string `json:"name"`
-		Arguments string `json:"arguments"`
-	} `json:"function"`
-}
-
 type Model struct {
 	Id      string `json:"id"`
 	Object  string `json:"object"`
@ -152,23 +134,11 @@ type Model struct {
 	OwnedBy string `json:"owned_by"`
 }

-type Embedding struct {
-	Object    string    `json:"object"`
-	Embedding []float32 `json:"embedding"`
-	Index     int       `json:"index"`
-}
-
 type ListCompletion struct {
 	Object string  `json:"object"`
 	Data   []Model `json:"data"`
 }

-type EmbeddingList struct {
-	Object string      `json:"object"`
-	Data   []Embedding `json:"data"`
-	Model  string      `json:"model"`
-}
-
 func NewError(code int, message string) ErrorResponse {
 	var etype string
 	switch code {
@ -183,31 +153,7 @@ func NewError(code int, message string) ErrorResponse {
 	return ErrorResponse{Error{Type: etype, Message: message}}
 }

-func toolCallId() string {
-	const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"
-	b := make([]byte, 8)
-	for i := range b {
-		b[i] = letterBytes[rand.Intn(len(letterBytes))]
-	}
-	return "call_" + strings.ToLower(string(b))
-}
-
 func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
-	toolCalls := make([]ToolCall, len(r.Message.ToolCalls))
-	for i, tc := range r.Message.ToolCalls {
-		toolCalls[i].ID = toolCallId()
-		toolCalls[i].Type = "function"
-		toolCalls[i].Function.Name = tc.Function.Name
-
-		args, err := json.Marshal(tc.Function.Arguments)
-		if err != nil {
-			slog.Error("could not marshall function arguments to json", "error", err)
-			continue
-		}
-
-		toolCalls[i].Function.Arguments = string(args)
-	}
-
 	return ChatCompletion{
 		Id:                id,
 		Object:            "chat.completion",
@ -216,7 +162,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 		SystemFingerprint: "fp_ollama",
 		Choices: []Choice{{
 			Index:   0,
-			Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
+			Message: Message{Role: r.Message.Role, Content: r.Message.Content},
 			FinishReason: func(reason string) *string {
 				if len(reason) > 0 {
 					return &reason
@ -225,6 +171,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 			}(r.DoneReason),
 		}},
 		Usage: Usage{
+			// TODO: Ollama returns 0 for the prompt eval count if the prompt was cached, but OpenAI returns the actual count
 			PromptTokens:     r.PromptEvalCount,
 			CompletionTokens: r.EvalCount,
 			TotalTokens:      r.PromptEvalCount + r.EvalCount,
@ -270,6 +217,7 @@ func toCompletion(id string, r api.GenerateResponse) Completion {
 			}(r.DoneReason),
 		}},
 		Usage: Usage{
+			// TODO: Ollama returns 0 for the prompt eval count if the prompt was cached, but OpenAI returns the actual count
 			PromptTokens:     r.PromptEvalCount,
 			CompletionTokens: r.EvalCount,
 			TotalTokens:      r.PromptEvalCount + r.EvalCount,
@ -314,27 +262,6 @@ func toListCompletion(r api.ListResponse) ListCompletion {
 	}
 }

-func toEmbeddingList(model string, r api.EmbedResponse) EmbeddingList {
-	if r.Embeddings != nil {
-		var data []Embedding
-		for i, e := range r.Embeddings {
-			data = append(data, Embedding{
-				Object:    "embedding",
-				Embedding: e,
-				Index:     i,
-			})
-		}
-
-		return EmbeddingList{
-			Object: "list",
-			Data:   data,
-			Model:  model,
-		}
-	}
-
-	return EmbeddingList{}
-}
-
 func toModel(r api.ShowResponse, m string) Model {
 	return Model{
 		Id:      m,
@ -351,6 +278,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		case string:
 			messages = append(messages, api.Message{Role: msg.Role, Content: content})
 		case []any:
+			message := api.Message{Role: msg.Role}
 			for _, c := range content {
 				data, ok := c.(map[string]any)
 				if !ok {
@ -362,7 +290,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					if !ok {
 						return nil, fmt.Errorf("invalid message format")
 					}
-					messages = append(messages, api.Message{Role: msg.Role, Content: text})
+					message.Content = text
 				case "image_url":
 					var url string
 					if urlMap, ok := data["image_url"].(map[string]any); ok {
@ -394,26 +322,14 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 					if err != nil {
 						return nil, fmt.Errorf("invalid message format")
 					}
-
-					messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}})
+					message.Images = append(message.Images, img)
 				default:
 					return nil, fmt.Errorf("invalid message format")
 				}
 			}
+			messages = append(messages, message)
 		default:
-			if msg.ToolCalls == nil {
-				return nil, fmt.Errorf("invalid message content type: %T", content)
-			}
-
-			toolCalls := make([]api.ToolCall, len(msg.ToolCalls))
-			for i, tc := range msg.ToolCalls {
-				toolCalls[i].Function.Name = tc.Function.Name
-				err := json.Unmarshal([]byte(tc.Function.Arguments), &toolCalls[i].Function.Arguments)
-				if err != nil {
-					return nil, fmt.Errorf("invalid tool call arguments")
-				}
-			}
-			messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
+			return nil, fmt.Errorf("invalid message content type: %T", content)
 		}
 	}

@ -471,7 +387,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
 		Format:   format,
 		Options:  options,
 		Stream:   &r.Stream,
-		Tools:    r.Tools,
 	}, nil
 }

@ -522,7 +437,6 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 		Prompt:  r.Prompt,
 		Options: options,
 		Stream:  &r.Stream,
-		Suffix:  r.Suffix,
 	}, nil
 }

@ -551,11 +465,6 @@ type RetrieveWriter struct {
 	model string
 }

-type EmbedWriter struct {
-	BaseWriter
-	model string
-}
-
 func (w *BaseWriter) writeError(code int, data []byte) (int, error) {
 	var serr api.StatusError
 	err := json.Unmarshal(data, &serr)
@ -721,33 +630,6 @@ func (w *RetrieveWriter) Write(data []byte) (int, error) {
 	return w.writeResponse(data)
 }

-func (w *EmbedWriter) writeResponse(data []byte) (int, error) {
-	var embedResponse api.EmbedResponse
-	err := json.Unmarshal(data, &embedResponse)
-
-	if err != nil {
-		return 0, err
-	}
-
-	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	err = json.NewEncoder(w.ResponseWriter).Encode(toEmbeddingList(w.model, embedResponse))
-
-	if err != nil {
-		return 0, err
-	}
-
-	return len(data), nil
-}
-
-func (w *EmbedWriter) Write(data []byte) (int, error) {
-	code := w.ResponseWriter.Status()
-	if code != http.StatusOK {
-		return w.writeError(code, data)
-	}
-
-	return w.writeResponse(data)
-}
-
 func ListMiddleware() gin.HandlerFunc {
 	return func(c *gin.Context) {
 		w := &ListWriter{
@ -811,47 +693,6 @@ func CompletionsMiddleware() gin.HandlerFunc {
 			id:         fmt.Sprintf("cmpl-%d", rand.Intn(999)),
 		}

-		c.Writer = w
-		c.Next()
-	}
-}
-
-func EmbeddingsMiddleware() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		var req EmbedRequest
-		err := c.ShouldBindJSON(&req)
-		if err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		if req.Input == "" {
-			req.Input = []string{""}
-		}
-
-		if req.Input == nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input"))
-			return
-		}
-
-		if v, ok := req.Input.([]any); ok && len(v) == 0 {
-			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, "invalid input"))
-			return
-		}
-
-		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(api.EmbedRequest{Model: req.Model, Input: req.Input}); err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error()))
-			return
-		}
-
-		c.Request.Body = io.NopCloser(&b)
-
-		w := &EmbedWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-			model:      req.Model,
-		}
-
 		c.Writer = w

 		c.Next()
@ -877,7 +718,6 @@ func ChatMiddleware() gin.HandlerFunc {
 		chatReq, err := fromChatRequest(req)
 		if err != nil {
 			c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error()))
-			return
 		}

 		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@ -20,59 +20,108 @@ const prefix = `data:image/jpeg;base64,`
 const image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
 const imageURL = prefix + image

-func prepareRequest(req *http.Request, body any) {
-	bodyBytes, _ := json.Marshal(body)
-	req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
-	req.Header.Set("Content-Type", "application/json")
-}
-
-func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
-	return func(c *gin.Context) {
-		bodyBytes, _ := io.ReadAll(c.Request.Body)
-		c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
-		err := json.Unmarshal(bodyBytes, capturedRequest)
-		if err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, "failed to unmarshal request")
-		}
-		c.Next()
-	}
-}
-
-func TestChatMiddleware(t *testing.T) {
+func TestMiddlewareRequests(t *testing.T) {
 	type testCase struct {
 		Name     string
+		Method   string
+		Path     string
+		Handler  func() gin.HandlerFunc
 		Setup    func(t *testing.T, req *http.Request)
-		Expected func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder)
+		Expected func(t *testing.T, req *http.Request)
 	}

-	var capturedRequest *api.ChatRequest
+	var capturedRequest *http.Request
+
+	captureRequestMiddleware := func() gin.HandlerFunc {
+		return func(c *gin.Context) {
+			bodyBytes, _ := io.ReadAll(c.Request.Body)
+			c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+			capturedRequest = c.Request
+			c.Next()
+		}
+	}

 	testCases := []testCase{
 		{
-			Name: "chat handler",
+			Name:    "chat handler",
+			Method:  http.MethodPost,
+			Path:    "/api/chat",
+			Handler: ChatMiddleware,
 			Setup: func(t *testing.T, req *http.Request) {
 				body := ChatCompletionRequest{
 					Model:    "test-model",
 					Messages: []Message{{Role: "user", Content: "Hello"}},
 				}
-				prepareRequest(req, body)
+
+				bodyBytes, _ := json.Marshal(body)
+
+				req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+				req.Header.Set("Content-Type", "application/json")
 			},
-			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != http.StatusOK {
-					t.Fatalf("expected 200, got %d", resp.Code)
+			Expected: func(t *testing.T, req *http.Request) {
+				var chatReq api.ChatRequest
+				if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil {
+					t.Fatal(err)
 				}

-				if req.Messages[0].Role != "user" {
-					t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
+				if chatReq.Messages[0].Role != "user" {
+					t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role)
 				}

-				if req.Messages[0].Content != "Hello" {
-					t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
+				if chatReq.Messages[0].Content != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content)
 				}
 			},
 		},
 		{
-			Name: "chat handler with image content",
+			Name:    "completions handler",
+			Method:  http.MethodPost,
+			Path:    "/api/generate",
+			Handler: CompletionsMiddleware,
+			Setup: func(t *testing.T, req *http.Request) {
+				temp := float32(0.8)
+				body := CompletionRequest{
+					Model:       "test-model",
+					Prompt:      "Hello",
+					Temperature: &temp,
+					Stop:        []string{"\n", "stop"},
+				}
+
+				bodyBytes, _ := json.Marshal(body)
+
+				req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+				req.Header.Set("Content-Type", "application/json")
+			},
+			Expected: func(t *testing.T, req *http.Request) {
+				var genReq api.GenerateRequest
+				if err := json.NewDecoder(req.Body).Decode(&genReq); err != nil {
+					t.Fatal(err)
+				}
+
+				if genReq.Prompt != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", genReq.Prompt)
+				}
+
+				if genReq.Options["temperature"] != 1.6 {
+					t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"])
+				}
+
+				stopTokens, ok := genReq.Options["stop"].([]any)
+
+				if !ok {
+					t.Fatalf("expected stop tokens to be a list")
+				}
+
+				if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
+					t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
+				}
+			},
+		},
+		{
+			Name:    "chat handler with image content",
+			Method:  http.MethodPost,
+			Path:    "/api/chat",
+			Handler: ChatMiddleware,
 			Setup: func(t *testing.T, req *http.Request) {
 				body := ChatCompletionRequest{
 					Model: "test-model",
@ -85,313 +134,58 @@ func TestChatMiddleware(t *testing.T) {
 						},
 					},
 				}
-				prepareRequest(req, body)
+
+				bodyBytes, _ := json.Marshal(body)
+
+				req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+				req.Header.Set("Content-Type", "application/json")
 			},
-			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != http.StatusOK {
-					t.Fatalf("expected 200, got %d", resp.Code)
+			Expected: func(t *testing.T, req *http.Request) {
+				var chatReq api.ChatRequest
+				if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil {
+					t.Fatal(err)
 				}

-				if req.Messages[0].Role != "user" {
-					t.Fatalf("expected 'user', got %s", req.Messages[0].Role)
+				if chatReq.Messages[0].Role != "user" {
+					t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role)
 				}

-				if req.Messages[0].Content != "Hello" {
-					t.Fatalf("expected 'Hello', got %s", req.Messages[0].Content)
+				if chatReq.Messages[0].Content != "Hello" {
+					t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content)
 				}

 				img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):])

-				if req.Messages[1].Role != "user" {
-					t.Fatalf("expected 'user', got %s", req.Messages[1].Role)
-				}
-
-				if !bytes.Equal(req.Messages[1].Images[0], img) {
-					t.Fatalf("expected image encoding, got %s", req.Messages[1].Images[0])
-				}
-			},
-		},
-		{
-			Name: "chat handler with tools",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := ChatCompletionRequest{
-					Model: "test-model",
-					Messages: []Message{
-						{Role: "user", Content: "What's the weather like in Paris Today?"},
-						{Role: "assistant", ToolCalls: []ToolCall{{
-							ID:   "id",
-							Type: "function",
-							Function: struct {
-								Name      string `json:"name"`
-								Arguments string `json:"arguments"`
-							}{
-								Name:      "get_current_weather",
-								Arguments: "{\"location\": \"Paris, France\", \"format\": \"celsius\"}",
-							},
-						}}},
-					},
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != 200 {
-					t.Fatalf("expected 200, got %d", resp.Code)
-				}
-
-				if req.Messages[0].Content != "What's the weather like in Paris Today?" {
-					t.Fatalf("expected What's the weather like in Paris Today?, got %s", req.Messages[0].Content)
-				}
-
-				if req.Messages[1].ToolCalls[0].Function.Arguments["location"] != "Paris, France" {
-					t.Fatalf("expected 'Paris, France', got %v", req.Messages[1].ToolCalls[0].Function.Arguments["location"])
-				}
-
-				if req.Messages[1].ToolCalls[0].Function.Arguments["format"] != "celsius" {
-					t.Fatalf("expected celsius, got %v", req.Messages[1].ToolCalls[0].Function.Arguments["format"])
-				}
-			},
-		},
-		{
-			Name: "chat handler error forwarding",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := ChatCompletionRequest{
-					Model:    "test-model",
-					Messages: []Message{{Role: "user", Content: 2}},
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.ChatRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != http.StatusBadRequest {
-					t.Fatalf("expected 400, got %d", resp.Code)
-				}
-
-				if !strings.Contains(resp.Body.String(), "invalid message content type") {
-					t.Fatalf("error was not forwarded")
+				if !bytes.Equal(chatReq.Messages[0].Images[0], img) {
+					t.Fatalf("expected image encoding, got %s", chatReq.Messages[0].Images[0])
 				}
 			},
 		},
 	}

+	gin.SetMode(gin.TestMode)
+	router := gin.New()
+
 	endpoint := func(c *gin.Context) {
 		c.Status(http.StatusOK)
 	}

-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(ChatMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/chat", endpoint)
-
 	for _, tc := range testCases {
 		t.Run(tc.Name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/chat", nil)
+			router = gin.New()
+			router.Use(captureRequestMiddleware())
+			router.Use(tc.Handler())
+			router.Handle(tc.Method, tc.Path, endpoint)
+			req, _ := http.NewRequest(tc.Method, tc.Path, nil)

-			tc.Setup(t, req)
+			if tc.Setup != nil {
+				tc.Setup(t, req)
+			}

 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

-			tc.Expected(t, capturedRequest, resp)
-
-			capturedRequest = nil
-		})
-	}
-}
-
-func TestCompletionsMiddleware(t *testing.T) {
-	type testCase struct {
-		Name     string
-		Setup    func(t *testing.T, req *http.Request)
-		Expected func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder)
-	}
-
-	var capturedRequest *api.GenerateRequest
-
-	testCases := []testCase{
-		{
-			Name: "completions handler",
-			Setup: func(t *testing.T, req *http.Request) {
-				temp := float32(0.8)
-				body := CompletionRequest{
-					Model:       "test-model",
-					Prompt:      "Hello",
-					Temperature: &temp,
-					Stop:        []string{"\n", "stop"},
-					Suffix:      "suffix",
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
-				if req.Prompt != "Hello" {
-					t.Fatalf("expected 'Hello', got %s", req.Prompt)
-				}
-
-				if req.Options["temperature"] != 1.6 {
-					t.Fatalf("expected 1.6, got %f", req.Options["temperature"])
-				}
-
-				stopTokens, ok := req.Options["stop"].([]any)
-
-				if !ok {
-					t.Fatalf("expected stop tokens to be a list")
-				}
-
-				if stopTokens[0] != "\n" || stopTokens[1] != "stop" {
-					t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens)
-				}
-
-				if req.Suffix != "suffix" {
-					t.Fatalf("expected 'suffix', got %s", req.Suffix)
-				}
-			},
-		},
-		{
-			Name: "completions handler error forwarding",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := CompletionRequest{
-					Model:       "test-model",
-					Prompt:      "Hello",
-					Temperature: nil,
-					Stop:        []int{1, 2},
-					Suffix:      "suffix",
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.GenerateRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != http.StatusBadRequest {
-					t.Fatalf("expected 400, got %d", resp.Code)
-				}
-
-				if !strings.Contains(resp.Body.String(), "invalid type for 'stop' field") {
-					t.Fatalf("error was not forwarded")
-				}
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(CompletionsMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.Name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/generate", nil)
-
-			tc.Setup(t, req)
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			tc.Expected(t, capturedRequest, resp)
-
-			capturedRequest = nil
-		})
-	}
-}
-
-func TestEmbeddingsMiddleware(t *testing.T) {
-	type testCase struct {
-		Name     string
-		Setup    func(t *testing.T, req *http.Request)
-		Expected func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder)
-	}
-
-	var capturedRequest *api.EmbedRequest
-
-	testCases := []testCase{
-		{
-			Name: "embed handler single input",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := EmbedRequest{
-					Input: "Hello",
-					Model: "test-model",
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
-				if req.Input != "Hello" {
-					t.Fatalf("expected 'Hello', got %s", req.Input)
-				}
-
-				if req.Model != "test-model" {
-					t.Fatalf("expected 'test-model', got %s", req.Model)
-				}
-			},
-		},
-		{
-			Name: "embed handler batch input",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := EmbedRequest{
-					Input: []string{"Hello", "World"},
-					Model: "test-model",
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
-				input, ok := req.Input.([]any)
-
-				if !ok {
-					t.Fatalf("expected input to be a list")
-				}
-
-				if input[0].(string) != "Hello" {
-					t.Fatalf("expected 'Hello', got %s", input[0])
-				}
-
-				if input[1].(string) != "World" {
-					t.Fatalf("expected 'World', got %s", input[1])
-				}
-
-				if req.Model != "test-model" {
-					t.Fatalf("expected 'test-model', got %s", req.Model)
-				}
-			},
-		},
-		{
-			Name: "embed handler error forwarding",
-			Setup: func(t *testing.T, req *http.Request) {
-				body := EmbedRequest{
-					Model: "test-model",
-				}
-				prepareRequest(req, body)
-			},
-			Expected: func(t *testing.T, req *api.EmbedRequest, resp *httptest.ResponseRecorder) {
-				if resp.Code != http.StatusBadRequest {
-					t.Fatalf("expected 400, got %d", resp.Code)
-				}
-
-				if !strings.Contains(resp.Body.String(), "invalid input") {
-					t.Fatalf("error was not forwarded")
-				}
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(EmbeddingsMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/embed", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.Name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/embed", nil)
-
-			tc.Setup(t, req)
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			tc.Expected(t, capturedRequest, resp)
-
-			capturedRequest = nil
+			tc.Expected(t, capturedRequest)
 		})
 	}
 }
@ -409,6 +203,36 @@ func TestMiddlewareResponses(t *testing.T) {
 	}

 	testCases := []testCase{
+		{
+			Name:     "completions handler error forwarding",
+			Method:   http.MethodPost,
+			Path:     "/api/generate",
+			TestPath: "/api/generate",
+			Handler:  CompletionsMiddleware,
+			Endpoint: func(c *gin.Context) {
+				c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"})
+			},
+			Setup: func(t *testing.T, req *http.Request) {
+				body := CompletionRequest{
+					Model:  "test-model",
+					Prompt: "Hello",
+				}
+
+				bodyBytes, _ := json.Marshal(body)
+
+				req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+				req.Header.Set("Content-Type", "application/json")
+			},
+			Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
+				if resp.Code != http.StatusBadRequest {
+					t.Fatalf("expected 400, got %d", resp.Code)
+				}
+
+				if !strings.Contains(resp.Body.String(), `"invalid request"`) {
+					t.Fatalf("error was not forwarded")
+				}
+			},
+		},
 		{
 			Name:     "list handler",
 			Method:   http.MethodGet,
@ -425,6 +249,8 @@ func TestMiddlewareResponses(t *testing.T) {
 				})
 			},
 			Expected: func(t *testing.T, resp *httptest.ResponseRecorder) {
+				assert.Equal(t, http.StatusOK, resp.Code)
+
 				var listResp ListCompletion
 				if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil {
 					t.Fatal(err)
@ -488,8 +314,6 @@ func TestMiddlewareResponses(t *testing.T) {
 			resp := httptest.NewRecorder()
 			router.ServeHTTP(resp, req)

-			assert.Equal(t, http.StatusOK, resp.Code)
-
 			tc.Expected(t, resp)
 		})
 	}
--- a/server/images.go
+++ b/server/images.go
@ -34,19 +34,13 @@ import (
 	"github.com/ollama/ollama/version"
 )

-var (
-	errCapabilities         = errors.New("does not support")
-	errCapabilityCompletion = errors.New("completion")
-	errCapabilityTools      = errors.New("tools")
-	errCapabilityInsert     = errors.New("insert")
-)
+var errCapabilityCompletion = errors.New("completion")

 type Capability string

 const (
 	CapabilityCompletion = Capability("completion")
 	CapabilityTools      = Capability("tools")
-	CapabilityInsert     = Capability("insert")
 )

 type registryOptions struct {
@ -99,12 +93,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 			}
 		case CapabilityTools:
 			if !slices.Contains(m.Template.Vars(), "tools") {
-				errs = append(errs, errCapabilityTools)
-			}
-		case CapabilityInsert:
-			vars := m.Template.Vars()
-			if !slices.Contains(vars, "suffix") {
-				errs = append(errs, errCapabilityInsert)
+				errs = append(errs, errors.New("tools"))
 			}
 		default:
 			slog.Error("unknown capability", "capability", cap)
@ -113,7 +102,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 	}

 	if err := errors.Join(errs...); err != nil {
-		return fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...))
+		return fmt.Errorf("does not support %w", errors.Join(errs...))
 	}

 	return nil
@ -492,12 +481,6 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				layers = append(layers, baseLayer.Layer)
 			}
 		case "license", "template", "system":
-			if c.Name == "template" {
-				if _, err := template.Parse(c.Args); err != nil {
-					return fmt.Errorf("%w: %s", errBadTemplate, err)
-				}
-			}
-
 			if c.Name != "license" {
 				// replace
 				layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
--- a/server/model.go
+++ b/server/model.go
@ -16,6 +16,7 @@ import (
 	"strings"
 	"text/template/parse"

+	"github.com/google/uuid"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/llm"
@ -311,14 +312,12 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 	}

 	var b bytes.Buffer
-	if err := tmpl.Execute(&b, map[string][]api.ToolCall{
+	if err := tmpl.Execute(&b, map[string][]map[string]any{
 		"ToolCalls": {
 			{
-				Function: api.ToolCallFunction{
-					Name: "@@name@@",
-					Arguments: api.ToolCallFunctionArguments{
-						"@@argument@@": 1,
-					},
+				"Function": map[string]any{
+					"Name":      "@@name@@",
+					"Arguments": "@@arguments@@",
 				},
 			},
 		},
@ -326,48 +325,57 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 		return nil, false
 	}

-	var kv map[string]any
+	var kv map[string]string
 	// execute the subtree with placeholders to identify the keys
-	// trim any commands that might exist in the template
-	if err := json.Unmarshal(bytes.TrimSuffix(b.Bytes(), []byte(",")), &kv); err != nil {
+	if err := json.Unmarshal(b.Bytes(), &kv); err != nil {
 		return nil, false
 	}

 	// find the keys that correspond to the name and arguments fields
 	var name, arguments string
 	for k, v := range kv {
-		switch v.(type) {
-		case string:
+		switch v {
+		case "@@name@@":
 			name = k
-		case map[string]any:
+		case "@@arguments@@":
 			arguments = k
 		}
 	}

-	var objs []map[string]any
-	for offset := 0; offset < len(s); {
-		var obj map[string]any
-		decoder := json.NewDecoder(strings.NewReader(s[offset:]))
-		if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
-			break
-		} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
-			// skip over any syntax errors
-			offset += int(syntax.Offset)
-		} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
-			// skip over any unmarshalable types
-			offset += int(unmarshalType.Offset)
-		} else if err != nil {
-			slog.Error("parseToolCalls", "error", err)
+	var sm []map[string]any
+	decoder := json.NewDecoder(strings.NewReader(s))
+	for {
+		// incrementally decode the JSON into a list of JSON objects
+		// skipping over any invalid tokens
+		if err := decoder.Decode(&sm); err != nil {
+			if errors.Is(err, io.EOF) {
+				break
+			}
+
+			if errors.As(err, new(*json.SyntaxError)) {
+				r := decoder.Buffered()
+				if _, err := r.Read(make([]byte, decoder.InputOffset()+1)); err != nil {
+					break
+				}
+
+				decoder = json.NewDecoder(r)
+				continue
+			}
+
 			return nil, false
-		} else {
-			offset += int(decoder.InputOffset())
-			objs = append(objs, obj)
 		}
+
+		// break as soon as a valid object is decoded
+		break
 	}

 	var toolCalls []api.ToolCall
-	for _, kv := range objs {
-		var call api.ToolCall
+	for _, kv := range sm {
+		call := api.ToolCall{
+			ID:   uuid.New().String(),
+			Type: "function",
+		}
+
 		for k, v := range kv {
 			switch k {
 			case name:
@ -380,5 +388,9 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
 		toolCalls = append(toolCalls, call)
 	}

-	return toolCalls, len(toolCalls) > 0
+	if len(toolCalls) > 0 {
+		return toolCalls, true
+	}
+
+	return nil, false
 }
--- a/server/model_test.go
+++ b/server/model_test.go
@ -115,6 +115,11 @@ func TestExtractFromZipFile(t *testing.T) {
 	}
 }

+type function struct {
+	Name      string         `json:"name"`
+	Arguments map[string]any `json:"arguments"`
+}
+
 func readFile(t *testing.T, base, name string) *bytes.Buffer {
 	t.Helper()

@ -131,16 +136,11 @@ func TestExecuteWithTools(t *testing.T) {
 	cases := []struct {
 		model  string
 		output string
-		ok     bool
 	}{
-		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
+		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`},
 		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]

-The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
-		{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
-
-		[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
-		{"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
+The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`},
 		{"command-r-plus", "Action: ```json" + `
 [
    {
@ -158,14 +158,8 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
        }
    }
 ]
-` + "```", true},
-		{"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
-		{"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
-		{"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
-		{"llama3-groq-tool-use", `<tool_call>
-{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
-{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
-</tool_call>`, true},
+` + "```"},
+		{"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`},
 	}

 	var tools []api.Tool
@ -180,18 +174,20 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,

 	calls := []api.ToolCall{
 		{
-			Function: api.ToolCallFunction{
+			Type: "function",
+			Function: function{
 				Name: "get_current_weather",
-				Arguments: api.ToolCallFunctionArguments{
+				Arguments: map[string]any{
 					"format":   "fahrenheit",
 					"location": "San Francisco, CA",
 				},
 			},
 		},
 		{
-			Function: api.ToolCallFunction{
+			Type: "function",
+			Function: function{
 				Name: "get_current_weather",
-				Arguments: api.ToolCallFunctionArguments{
+				Arguments: map[string]any{
 					"format":   "celsius",
 					"location": "Toronto, Canada",
 				},
@ -220,14 +216,17 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
 			t.Run("parse", func(t *testing.T) {
 				m := &Model{Template: tmpl}
 				actual, ok := m.parseToolCalls(tt.output)
-				if ok != tt.ok {
-					t.Fatalf("expected %t, got %t", tt.ok, ok)
+				if !ok {
+					t.Fatal("failed to parse tool calls")
 				}

-				if tt.ok {
-					if diff := cmp.Diff(actual, calls); diff != "" {
-						t.Errorf("mismatch (-got +want):\n%s", diff)
-					}
+				for i := range actual {
+					// ID is randomly generated so clear it for comparison
+					actual[i].ID = ""
+				}
+
+				if diff := cmp.Diff(actual, calls); diff != "" {
+					t.Errorf("mismatch (-got +want):\n%s", diff)
 				}
 			})
 		})
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@ -3,6 +3,7 @@ package server
 import (
 	"bytes"
 	"context"
+	"strings"
 	"testing"

 	"github.com/google/go-cmp/cmp"
@ -10,6 +11,14 @@ import (
 	"github.com/ollama/ollama/template"
 )

+func tokenize(_ context.Context, s string) (tokens []int, err error) {
+	for range strings.Fields(s) {
+		tokens = append(tokens, len(tokens))
+	}
+
+	return
+}
+
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
@ -183,11 +192,15 @@ func TestChatPrompt(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
 			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
-			prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil)
+			prompt, images, err := chatPrompt(context.TODO(), &model, tokenize, &opts, tt.msgs, nil)
 			if err != nil {
 				t.Fatal(err)
 			}

+			if tt.prompt != prompt {
+				t.Errorf("expected %q, got %q", tt.prompt, prompt)
+			}
+
 			if diff := cmp.Diff(prompt, tt.prompt); diff != "" {
 				t.Errorf("mismatch (-got +want):\n%s", diff)
 			}
--- a/server/routes.go
+++ b/server/routes.go
@ -56,7 +56,6 @@ func init() {
 }

 var errRequired = errors.New("is required")
-var errBadTemplate = errors.New("template error")

 func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 	opts := api.DefaultOptions()
@ -123,10 +122,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	}

 	caps := []Capability{CapabilityCompletion}
-	if req.Suffix != "" {
-		caps = append(caps, CapabilityInsert)
-	}
-
 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
@ -155,6 +150,19 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 	prompt := req.Prompt
 	if !req.Raw {
+		var msgs []api.Message
+		if req.System != "" {
+			msgs = append(msgs, api.Message{Role: "system", Content: req.System})
+		} else if m.System != "" {
+			msgs = append(msgs, api.Message{Role: "system", Content: m.System})
+		}
+
+		for _, i := range images {
+			msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+		}
+
+		msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt})
+
 		tmpl := m.Template
 		if req.Template != "" {
 			tmpl, err = template.Parse(req.Template)
@ -175,26 +183,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			b.WriteString(s)
 		}

-		var values template.Values
-		if req.Suffix != "" {
-			values.Prompt = prompt
-			values.Suffix = req.Suffix
-		} else {
-			var msgs []api.Message
-			if req.System != "" {
-				msgs = append(msgs, api.Message{Role: "system", Content: req.System})
-			} else if m.System != "" {
-				msgs = append(msgs, api.Message{Role: "system", Content: m.System})
-			}
-
-			for _, i := range images {
-				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
-			}
-
-			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
-		}
-
-		if err := tmpl.Execute(&b, values); err != nil {
+		if err := tmpl.Execute(&b, template.Values{Messages: msgs}); err != nil {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}
@ -276,6 +265,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		}

 		r.Response = sb.String()
+		if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
+			r.ToolCalls = toolCalls
+			r.Response = ""
+		}
+
 		c.JSON(http.StatusOK, r)
 		return
 	}
@ -610,11 +604,8 @@ func (s *Server) CreateModelHandler(c *gin.Context) {

 		quantization := cmp.Or(r.Quantize, r.Quantization)
 		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
-			if errors.Is(err, errBadTemplate) {
-			  ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
-			}
 			ch <- gin.H{"error": err.Error()}
-		  }
+		}
 	}()

 	if r.Stream != nil && !*r.Stream {
@ -1073,7 +1064,6 @@ func (s *Server) GenerateRoutes() http.Handler {
 	// Compatibility endpoints
 	r.POST("/v1/chat/completions", openai.ChatMiddleware(), s.ChatHandler)
 	r.POST("/v1/completions", openai.CompletionsMiddleware(), s.GenerateHandler)
-	r.POST("/v1/embeddings", openai.EmbeddingsMiddleware(), s.EmbedHandler)
 	r.GET("/v1/models", openai.ListMiddleware(), s.ListModelsHandler)
 	r.GET("/v1/models/:model", openai.RetrieveMiddleware(), s.ShowModelHandler)

@ -1200,15 +1190,11 @@ func waitForStream(c *gin.Context, ch chan interface{}) {
 				return
 			}
 		case gin.H:
-			status, ok := r["status"].(int)
-			if !ok {
-				status = http.StatusInternalServerError
-			}
 			if errorMsg, ok := r["error"].(string); ok {
-				c.JSON(status, gin.H{"error": errorMsg})
+				c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
 				return
 			} else {
-				c.JSON(status, gin.H{"error": "unexpected error format in progress response"})
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
 				return
 			}
 		default:
@ -1298,7 +1284,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}

 	caps := []Capability{CapabilityCompletion}
-	if len(req.Tools) > 0 {
+	if req.Tools != nil {
 		caps = append(caps, CapabilityTools)
 	}

@ -1324,7 +1310,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	if req.Messages[0].Role != "system" && m.System != "" {
+	if req.Messages[0].Role != "system" {
 		req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...)
 	}

@ -1393,12 +1379,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}

 		resp.Message.Content = sb.String()
-
-		if len(req.Tools) > 0 {
-			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
-				resp.Message.ToolCalls = toolCalls
-				resp.Message.Content = ""
-			}
+		if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
+			resp.Message.ToolCalls = toolCalls
+			resp.Message.Content = ""
 		}

 		c.JSON(http.StatusOK, resp)
@ -1410,7 +1393,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

 func handleScheduleError(c *gin.Context, name string, err error) {
 	switch {
-	case errors.Is(err, errCapabilities), errors.Is(err, errRequired):
+	case errors.Is(err, errRequired):
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 	case errors.Is(err, context.Canceled):
 		c.JSON(499, gin.H{"error": "request canceled"})
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@ -85,8 +85,6 @@ func checkFileExists(t *testing.T, p string, expect []string) {
 }

 func TestCreateFromBin(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -113,8 +111,6 @@ func TestCreateFromBin(t *testing.T) {
 }

 func TestCreateFromModel(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -156,8 +152,6 @@ func TestCreateFromModel(t *testing.T) {
 }

 func TestCreateRemovesLayers(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -205,8 +199,6 @@ func TestCreateRemovesLayers(t *testing.T) {
 }

 func TestCreateUnsetsSystem(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -263,8 +255,6 @@ func TestCreateUnsetsSystem(t *testing.T) {
 }

 func TestCreateMergeParameters(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -368,8 +358,6 @@ func TestCreateMergeParameters(t *testing.T) {
 }

 func TestCreateReplacesMessages(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -446,8 +434,6 @@ func TestCreateReplacesMessages(t *testing.T) {
 }

 func TestCreateTemplateSystem(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -491,47 +477,9 @@ func TestCreateTemplateSystem(t *testing.T) {
 	if string(system) != "Say bye!" {
 		t.Errorf("expected \"Say bye!\", actual %s", system)
 	}
-
-	t.Run("incomplete template", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-			Name:      "test",
-			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ .Prompt", createBinFile(t, nil, nil)),
-			Stream:    &stream,
-		})
-	
-		if w.Code != http.StatusBadRequest {
-			t.Fatalf("expected status code 400, actual %d", w.Code)
-		}
-	})
-
-	t.Run("template with unclosed if", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-			Name:      "test",
-			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{ if .Prompt }}", createBinFile(t, nil, nil)),
-			Stream:    &stream,
-		})
-	
-		if w.Code != http.StatusBadRequest {
-			t.Fatalf("expected status code 400, actual %d", w.Code)
-		}
-	})
-
-	t.Run("template with undefined function", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-			Name:      "test",
-			Modelfile: fmt.Sprintf("FROM %s\nTEMPLATE {{  Prompt }}", createBinFile(t, nil, nil)),
-			Stream:    &stream,
-		})
-	
-		if w.Code != http.StatusBadRequest {
-			t.Fatalf("expected status code 400, actual %d", w.Code)
-		}
-	})
 }

 func TestCreateLicenses(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -578,8 +526,6 @@ func TestCreateLicenses(t *testing.T) {
 }

 func TestCreateDetectTemplate(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@ -8,15 +8,12 @@ import (
 	"path/filepath"
 	"testing"

-	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )

 func TestDelete(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	envconfig.LoadConfig()
@ -80,8 +77,6 @@ func TestDelete(t *testing.T) {
 }

 func TestDeleteDuplicateLayers(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
 	var s Server
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@ -1,714 +0,0 @@
-package server
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/gin-gonic/gin"
-	"github.com/google/go-cmp/cmp"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
-	"github.com/ollama/ollama/llm"
-)
-
-type mockRunner struct {
-	llm.LlamaServer
-
-	// CompletionRequest is only valid until the next call to Completion
-	llm.CompletionRequest
-	llm.CompletionResponse
-}
-
-func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
-	m.CompletionRequest = r
-	fn(m.CompletionResponse)
-	return nil
-}
-
-func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error) {
-	for range strings.Fields(s) {
-		tokens = append(tokens, len(tokens))
-	}
-
-	return
-}
-
-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		return mock, nil
-	}
-}
-
-func TestGenerateChat(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         "stop",
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-			},
-		},
-	}
-
-	go s.sched.Run(context.TODO())
-
-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-		Model: "test",
-		Modelfile: fmt.Sprintf(`FROM %s
-		TEMPLATE """
-{{- if .System }}System: {{ .System }} {{ end }}
-{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
-{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
-			"general.architecture":          "llama",
-			"llama.block_count":             uint32(1),
-			"llama.context_length":          uint32(8192),
-			"llama.embedding_length":        uint32(4096),
-			"llama.attention.head_count":    uint32(32),
-			"llama.attention.head_count_kv": uint32(8),
-			"tokenizer.ggml.tokens":         []string{""},
-			"tokenizer.ggml.scores":         []float32{0},
-			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
-			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		})),
-		Stream: &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("missing body", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, nil)
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("missing model", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{})
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("missing capabilities chat", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
-				"general.architecture": "bert",
-				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		w = createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "bert",
-		})
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support chat"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("load model", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test",
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		var actual api.ChatResponse
-		if err := json.NewDecoder(w.Body).Decode(&actual); err != nil {
-			t.Fatal(err)
-		}
-
-		if actual.Model != "test" {
-			t.Errorf("expected model test, got %s", actual.Model)
-		}
-
-		if !actual.Done {
-			t.Errorf("expected done true, got false")
-		}
-
-		if actual.DoneReason != "load" {
-			t.Errorf("expected done reason load, got %s", actual.DoneReason)
-		}
-	})
-
-	checkChatResponse := func(t *testing.T, body io.Reader, model, content string) {
-		t.Helper()
-
-		var actual api.ChatResponse
-		if err := json.NewDecoder(body).Decode(&actual); err != nil {
-			t.Fatal(err)
-		}
-
-		if actual.Model != model {
-			t.Errorf("expected model test, got %s", actual.Model)
-		}
-
-		if !actual.Done {
-			t.Errorf("expected done false, got true")
-		}
-
-		if actual.DoneReason != "stop" {
-			t.Errorf("expected done reason stop, got %s", actual.DoneReason)
-		}
-
-		if diff := cmp.Diff(actual.Message, api.Message{
-			Role:    "assistant",
-			Content: content,
-		}); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		if actual.PromptEvalCount == 0 {
-			t.Errorf("expected prompt eval count > 0, got 0")
-		}
-
-		if actual.PromptEvalDuration == 0 {
-			t.Errorf("expected prompt eval duration > 0, got 0")
-		}
-
-		if actual.EvalCount == 0 {
-			t.Errorf("expected eval count > 0, got 0")
-		}
-
-		if actual.EvalDuration == 0 {
-			t.Errorf("expected eval duration > 0, got 0")
-		}
-
-		if actual.LoadDuration == 0 {
-			t.Errorf("expected load duration > 0, got 0")
-		}
-
-		if actual.TotalDuration == 0 {
-			t.Errorf("expected total duration > 0, got 0")
-		}
-	}
-
-	mock.CompletionResponse.Content = "Hi!"
-	t.Run("messages", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test",
-			Messages: []api.Message{
-				{Role: "user", Content: "Hello!"},
-			},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkChatResponse(t, w.Body, "test", "Hi!")
-	})
-
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
-		Model:     "test-system",
-		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("messages with model system", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test-system",
-			Messages: []api.Message{
-				{Role: "user", Content: "Hello!"},
-			},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkChatResponse(t, w.Body, "test-system", "Hi!")
-	})
-
-	mock.CompletionResponse.Content = "Abra kadabra!"
-	t.Run("messages with system", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test-system",
-			Messages: []api.Message{
-				{Role: "system", Content: "You can perform magic tricks."},
-				{Role: "user", Content: "Hello!"},
-			},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
-	})
-
-	t.Run("messages with interleaved system", func(t *testing.T) {
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test-system",
-			Messages: []api.Message{
-				{Role: "user", Content: "Hello!"},
-				{Role: "assistant", Content: "I can help you with that."},
-				{Role: "system", Content: "You can perform magic tricks."},
-				{Role: "user", Content: "Help me write tests."},
-			},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! Assistant: I can help you with that. System: You can perform magic tricks. User: Help me write tests. "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkChatResponse(t, w.Body, "test-system", "Abra kadabra!")
-	})
-}
-
-func TestGenerate(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         "stop",
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-			},
-		},
-	}
-
-	go s.sched.Run(context.TODO())
-
-	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-		Model: "test",
-		Modelfile: fmt.Sprintf(`FROM %s
-		TEMPLATE """
-{{- if .System }}System: {{ .System }} {{ end }}
-{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
-{{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
-			"general.architecture":          "llama",
-			"llama.block_count":             uint32(1),
-			"llama.context_length":          uint32(8192),
-			"llama.embedding_length":        uint32(4096),
-			"llama.attention.head_count":    uint32(32),
-			"llama.attention.head_count_kv": uint32(8),
-			"tokenizer.ggml.tokens":         []string{""},
-			"tokenizer.ggml.scores":         []float32{0},
-			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
-			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		})),
-		Stream: &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("missing body", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, nil)
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("missing model", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{})
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"model is required"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("missing capabilities generate", func(t *testing.T) {
-		w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
-			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
-				"general.architecture": "bert",
-				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		w = createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model: "bert",
-		})
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"\"bert\" does not support generate"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("missing capabilities suffix", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test",
-			Prompt: "def add(",
-			Suffix: "    return c",
-		})
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"test does not support insert"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("load model", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model: "test",
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		var actual api.GenerateResponse
-		if err := json.NewDecoder(w.Body).Decode(&actual); err != nil {
-			t.Fatal(err)
-		}
-
-		if actual.Model != "test" {
-			t.Errorf("expected model test, got %s", actual.Model)
-		}
-
-		if !actual.Done {
-			t.Errorf("expected done true, got false")
-		}
-
-		if actual.DoneReason != "load" {
-			t.Errorf("expected done reason load, got %s", actual.DoneReason)
-		}
-	})
-
-	checkGenerateResponse := func(t *testing.T, body io.Reader, model, content string) {
-		t.Helper()
-
-		var actual api.GenerateResponse
-		if err := json.NewDecoder(body).Decode(&actual); err != nil {
-			t.Fatal(err)
-		}
-
-		if actual.Model != model {
-			t.Errorf("expected model test, got %s", actual.Model)
-		}
-
-		if !actual.Done {
-			t.Errorf("expected done false, got true")
-		}
-
-		if actual.DoneReason != "stop" {
-			t.Errorf("expected done reason stop, got %s", actual.DoneReason)
-		}
-
-		if actual.Response != content {
-			t.Errorf("expected response %s, got %s", content, actual.Response)
-		}
-
-		if actual.Context == nil {
-			t.Errorf("expected context not nil")
-		}
-
-		if actual.PromptEvalCount == 0 {
-			t.Errorf("expected prompt eval count > 0, got 0")
-		}
-
-		if actual.PromptEvalDuration == 0 {
-			t.Errorf("expected prompt eval duration > 0, got 0")
-		}
-
-		if actual.EvalCount == 0 {
-			t.Errorf("expected eval count > 0, got 0")
-		}
-
-		if actual.EvalDuration == 0 {
-			t.Errorf("expected eval duration > 0, got 0")
-		}
-
-		if actual.LoadDuration == 0 {
-			t.Errorf("expected load duration > 0, got 0")
-		}
-
-		if actual.TotalDuration == 0 {
-			t.Errorf("expected total duration > 0, got 0")
-		}
-	}
-
-	mock.CompletionResponse.Content = "Hi!"
-	t.Run("prompt", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test",
-			Prompt: "Hello!",
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkGenerateResponse(t, w.Body, "test", "Hi!")
-	})
-
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
-		Model:     "test-system",
-		Modelfile: "FROM test\nSYSTEM You are a helpful assistant.",
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("prompt with model system", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-system",
-			Prompt: "Hello!",
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You are a helpful assistant. User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkGenerateResponse(t, w.Body, "test-system", "Hi!")
-	})
-
-	mock.CompletionResponse.Content = "Abra kadabra!"
-	t.Run("prompt with system", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-system",
-			Prompt: "Hello!",
-			System: "You can perform magic tricks.",
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "System: You can perform magic tricks. User: Hello! "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
-	})
-
-	t.Run("prompt with template", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-system",
-			Prompt: "Help me write tests.",
-			System: "You can perform magic tricks.",
-			Template: `{{- if .System }}{{ .System }} {{ end }}
-{{- if .Prompt }}### USER {{ .Prompt }} {{ end }}
-{{- if .Response }}### ASSISTANT {{ .Response }} {{ end }}`,
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "You can perform magic tricks. ### USER Help me write tests. "); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-
-		checkGenerateResponse(t, w.Body, "test-system", "Abra kadabra!")
-	})
-
-	w = createRequest(t, s.CreateModelHandler, api.CreateRequest{
-		Model: "test-suffix",
-		Modelfile: `FROM test
-TEMPLATE """{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
-{{- else }}{{ .Prompt }}
-{{- end }}"""`,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("prompt with suffix", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-suffix",
-			Prompt: "def add(",
-			Suffix: "    return c",
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "<PRE> def add( <SUF>    return c <MID>"); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("prompt without suffix", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-suffix",
-			Prompt: "def add(",
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "def add("); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
-	t.Run("raw", func(t *testing.T) {
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test-system",
-			Prompt: "Help me write tests.",
-			Raw:    true,
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "Help me write tests."); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-}
--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@ -7,14 +7,11 @@ import (
 	"slices"
 	"testing"

-	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 )

 func TestList(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
 	envconfig.LoadConfig()

--- a/server/sched_test.go
+++ b/server/sched_test.go
@ -94,7 +94,7 @@ func TestLoad(t *testing.T) {
 	require.Len(t, s.expiredCh, 1)
 }

-type reqBundle struct {
+type bundle struct {
 	ctx     context.Context //nolint:containedctx
 	ctxDone func()
 	srv     *mockLlm
@ -102,13 +102,13 @@ type reqBundle struct {
 	ggml    *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }

-func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
-	b := &reqBundle{}
-	b.ctx, b.ctxDone = context.WithCancel(ctx)
+func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
+	scenario := &bundle{}
+	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
 	t.Helper()

 	f, err := os.CreateTemp(t.TempDir(), modelName)
@ -135,154 +135,124 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)

-	if duration == nil {
-		duration = &api.Duration{Duration: 5 * time.Millisecond}
-	}
-	b.req = &LlmRequest{
-		ctx:             b.ctx,
+	scenario.req = &LlmRequest{
+		ctx:             scenario.ctx,
 		model:           model,
 		opts:            api.DefaultOptions(),
-		sessionDuration: duration,
+		sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
-	return b
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
+	return scenario
 }

-func getGpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "metal"}
-	g.TotalMemory = 24 * format.GigaByte
-	g.FreeMemory = 12 * format.GigaByte
-	return []gpu.GpuInfo{g}
-}
-
-func getCpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "cpu"}
-	g.TotalMemory = 32 * format.GigaByte
-	g.FreeMemory = 26 * format.GigaByte
-	return []gpu.GpuInfo{g}
-}
-
-func TestRequestsSameModelSameRequest(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
+func TestRequests(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
 	defer done()
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
-	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
-	b.req.model = a.req.model
-	b.ggml = a.ggml

-	s.newServerFn = a.newServer
-	slog.Info("a")
-	s.pendingReqCh <- a.req
+	// Same model, same request
+	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
+	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
+	scenario1b.req.model = scenario1a.req.model
+	scenario1b.ggml = scenario1a.ggml
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
+
+	// simple reload of same model
+	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
+	tmpModel := *scenario1a.req.model
+	scenario2a.req.model = &tmpModel
+	scenario2a.ggml = scenario1a.ggml
+	scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
+
+	// Multiple loaded models
+	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
+	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
+	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
+	scenario3c.req.opts.NumGPU = 0                           // CPU load, will be allowed
+	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
+
+	s := InitScheduler(ctx)
+	s.getGpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "metal"}
+		g.TotalMemory = 24 * format.GigaByte
+		g.FreeMemory = 12 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
+	s.getCpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "cpu"}
+		g.TotalMemory = 32 * format.GigaByte
+		g.FreeMemory = 26 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
+	s.newServerFn = scenario1a.newServer
+	slog.Info("scenario1a")
+	s.pendingReqCh <- scenario1a.req
 	require.Len(t, s.pendingReqCh, 1)
 	s.Run(ctx)
 	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
+	case resp := <-scenario1a.req.successCh:
+		require.Equal(t, resp.llama, scenario1a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
+		require.Empty(t, scenario1a.req.errCh)
+	case err := <-scenario1a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}

 	// Same runner as first request due to not needing a reload
-	s.newServerFn = b.newServer
-	slog.Info("b")
-	s.pendingReqCh <- b.req
+	s.newServerFn = scenario1b.newServer
+	slog.Info("scenario1b")
+	s.pendingReqCh <- scenario1b.req
 	select {
-	case resp := <-b.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
+	case resp := <-scenario1b.req.successCh:
+		require.Equal(t, resp.llama, scenario1a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, b.req.errCh)
-	case err := <-b.req.errCh:
-		t.Fatal(err.Error())
-	case <-ctx.Done():
-		t.Fatal("timeout")
-	}
-}
-
-func TestRequestsSimpleReloadSameModel(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
-	defer done()
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
-	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
-	tmpModel := *a.req.model
-	b.req.model = &tmpModel
-	b.ggml = a.ggml
-
-	s.newServerFn = a.newServer
-	slog.Info("a")
-	s.pendingReqCh <- a.req
-	require.Len(t, s.pendingReqCh, 1)
-	s.Run(ctx)
-	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
-		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
+		require.Empty(t, scenario1b.req.errCh)
+	case err := <-scenario1b.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}

 	// Trigger a reload
-	s.newServerFn = b.newServer
-	b.req.model.AdapterPaths = []string{"new"}
-	slog.Info("b")
-	s.pendingReqCh <- b.req
+	s.newServerFn = scenario2a.newServer
+	scenario2a.req.model.AdapterPaths = []string{"new"}
+	slog.Info("scenario2a")
+	s.pendingReqCh <- scenario2a.req
 	// finish first two requests, so model can reload
 	time.Sleep(1 * time.Millisecond)
-	a.ctxDone()
+	scenario1a.ctxDone()
+	scenario1b.ctxDone()
 	select {
-	case resp := <-b.req.successCh:
-		require.Equal(t, resp.llama, b.srv)
+	case resp := <-scenario2a.req.successCh:
+		require.Equal(t, resp.llama, scenario2a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, b.req.errCh)
-	case err := <-b.req.errCh:
+		require.Empty(t, scenario2a.req.errCh)
+	case err := <-scenario2a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
-}
-
-func TestRequestsMultipleLoadedModels(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
-	defer done()
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
-
-	// Multiple loaded models
-	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
-	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
-	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
-	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
-	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

 	envconfig.MaxRunners = 1
-	s.newServerFn = a.newServer
-	slog.Info("a")
-	s.pendingReqCh <- a.req
-	s.Run(ctx)
+	s.newServerFn = scenario3a.newServer
+	slog.Info("scenario3a")
+	s.pendingReqCh <- scenario3a.req
+	// finish prior request, so new model can load
+	time.Sleep(1 * time.Millisecond)
+	scenario2a.ctxDone()
 	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
+	case resp := <-scenario3a.req.successCh:
+		require.Equal(t, resp.llama, scenario3a.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
+		require.Empty(t, scenario3a.req.errCh)
+	case err := <-scenario3a.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@ -292,15 +262,15 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	s.loadedMu.Unlock()

 	envconfig.MaxRunners = 0
-	s.newServerFn = b.newServer
-	slog.Info("b")
-	s.pendingReqCh <- b.req
+	s.newServerFn = scenario3b.newServer
+	slog.Info("scenario3b")
+	s.pendingReqCh <- scenario3b.req
 	select {
-	case resp := <-b.req.successCh:
-		require.Equal(t, resp.llama, b.srv)
+	case resp := <-scenario3b.req.successCh:
+		require.Equal(t, resp.llama, scenario3b.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, b.req.errCh)
-	case err := <-b.req.errCh:
+		require.Empty(t, scenario3b.req.errCh)
+	case err := <-scenario3b.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@ -310,15 +280,15 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	s.loadedMu.Unlock()

 	// This is a CPU load with NumGPU = 0 so it should load
-	s.newServerFn = c.newServer
-	slog.Info("c")
-	s.pendingReqCh <- c.req
+	s.newServerFn = scenario3c.newServer
+	slog.Info("scenario3c")
+	s.pendingReqCh <- scenario3c.req
 	select {
-	case resp := <-c.req.successCh:
-		require.Equal(t, resp.llama, c.srv)
+	case resp := <-scenario3c.req.successCh:
+		require.Equal(t, resp.llama, scenario3c.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, c.req.errCh)
-	case err := <-c.req.errCh:
+		require.Empty(t, scenario3c.req.errCh)
+	case err := <-scenario3c.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
@ -328,25 +298,25 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	s.loadedMu.Unlock()

 	// Try to load a model that wont fit
-	s.newServerFn = d.newServer
-	slog.Info("d")
+	s.newServerFn = scenario3d.newServer
+	slog.Info("scenario3d")
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 3)
 	s.loadedMu.Unlock()
-	a.ctxDone() // Won't help since this one isn't big enough to make room
+	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
 	time.Sleep(2 * time.Millisecond)
-	s.pendingReqCh <- d.req
+	s.pendingReqCh <- scenario3d.req
 	// finish prior request, so new model can load
 	time.Sleep(6 * time.Millisecond)
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 2)
 	s.loadedMu.Unlock()
-	b.ctxDone()
+	scenario3b.ctxDone()
 	select {
-	case resp := <-d.req.successCh:
-		require.Equal(t, resp.llama, d.srv)
+	case resp := <-scenario3d.req.successCh:
+		require.Equal(t, resp.llama, scenario3d.srv)
 		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, d.req.errCh)
+		require.Empty(t, scenario3d.req.errCh)
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@ -359,19 +329,26 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()

-	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
-	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
-	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
+	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
+	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
+	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
+	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
+	scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
 	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
-	s.newServerFn = a.newServer
-	slog.Info("a")
-	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
+	s.getGpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "metal"}
+		g.TotalMemory = 24 * format.GigaByte
+		g.FreeMemory = 12 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
+	s.newServerFn = scenario1a.newServer
+	slog.Info("scenario1a")
+	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
-	slog.Info("b")
-	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
+	slog.Info("scenario1b")
+	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
 	require.Empty(t, successCh1b)
 	require.Len(t, errCh1b, 1)
@ -380,24 +357,22 @@ func TestGetRunner(t *testing.T) {
 	s.Run(ctx)
 	select {
 	case resp := <-successCh1a:
-		require.Equal(t, resp.llama, a.srv)
+		require.Equal(t, resp.llama, scenario1a.srv)
 		require.Empty(t, s.pendingReqCh)
 		require.Empty(t, errCh1a)
-	case err := <-errCh1a:
-		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
-	a.ctxDone() // Set "a" model to idle so it can unload
+	scenario1a.ctxDone()
 	s.loadedMu.Lock()
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()

-	c.req.model.ModelPath = "bad path"
-	slog.Info("c")
-	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
+	scenario1c.req.model.ModelPath = "bad path"
+	slog.Info("scenario1c")
+	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
 	// Starts in pending channel, then should be quickly processsed to return an error
-	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
+	time.Sleep(5 * time.Millisecond)
 	require.Empty(t, successCh1c)
 	s.loadedMu.Lock()
 	require.Empty(t, s.loaded)
@ -405,7 +380,7 @@ func TestGetRunner(t *testing.T) {
 	require.Len(t, errCh1c, 1)
 	err = <-errCh1c
 	require.Contains(t, err.Error(), "bad path")
-	b.ctxDone()
+	scenario1b.ctxDone()
 }

 // TODO - add one scenario that triggers the bogus finished event with positive ref count
@ -414,7 +389,7 @@ func TestPrematureExpired(t *testing.T) {
 	defer done()

 	// Same model, same request
-	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
+	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
 		g := gpu.GpuInfo{Library: "metal"}
@ -436,8 +411,6 @@ func TestPrematureExpired(t *testing.T) {
 		s.loadedMu.Unlock()
 		slog.Info("sending premature expired event now")
 		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
-	case err := <-errCh1a:
-		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@ -473,8 +446,6 @@ func TestUseLoadedRunner(t *testing.T) {
 	select {
 	case success := <-req.successCh:
 		require.Equal(t, r1, success)
-	case err := <-req.errCh:
-		t.Fatal(err.Error())
 	case <-ctx.Done():
 		t.Fatal("timeout")
 	}
@ -654,7 +625,8 @@ func TestAlreadyCanceled(t *testing.T) {
 	defer done()
 	dctx, done2 := context.WithCancel(ctx)
 	done2()
-	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
+	scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
+	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
 	s := InitScheduler(ctx)
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
--- a/server/testdata/tools/command-r-plus.gotmpl
+++ b/server/testdata/tools/command-r-plus.gotmpl
@ -46,7 +46,7 @@ Action: ```json
 {{- range .ToolCalls }}
    {
        "tool_name": "{{ .Function.Name }}",
-        "parameters": {{ .Function.Arguments }}
+        "parameters": {{ json .Function.Arguments }}
    }
 {{- end }}
 ]```
--- a/server/testdata/tools/firefunction.gotmpl
+++ b/server/testdata/tools/firefunction.gotmpl
@ -17,7 +17,7 @@ If you decide to call functions:

 Available functions as JSON spec:
 {{- if .Tools }}
-{{ .Tools }}
+{{ json .Tools }}
 {{- end }}<|eot_id|>
 {{- end }}
 {{- range .Messages }}<|start_header_id|>
@ -25,7 +25,7 @@ Available functions as JSON spec:
 {{- end }}<|end_header_id|>
 {{- if .Content }}{{ .Content }}
 {{- else if .ToolCalls }} functools[
-{{- range .ToolCalls }}{{ "{" }}"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}{{ "}" }}
+{{- range .ToolCalls }}{{ "{" }}"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}{{ "}" }}
 {{- end }}]
 {{- end }}<|eot_id|>
 {{- end }}<|start_header_id|>assistant<|end_header_id|>
--- a/server/testdata/tools/llama3-groq-tool-use.gotmpl
+++ b/server/testdata/tools/llama3-groq-tool-use.gotmpl
@ -1,43 +0,0 @@
-{{- if .Messages }}
-{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
-
-{{ .System }}
-{{- if .Tools }} You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
-<tool_call>
-{"name": <function-name>,"arguments": <args-dict>}
-</tool_call>
-
-Here are the available tools:
-<tools>
-{{- range .Tools }} {{ .Function }}
-{{- end }} </tools>
-{{- end }}
-{{- end }}<|eot_id|>
-{{- range .Messages }}
-{{- if ne .Role "system" }}<|start_header_id|>{{ .Role }}<|end_header_id|>
-
-{{ if eq .Role "user" }}{{ .Content }}
-{{- else if eq .Role "assistant" }}
-{{- if .Content }}{{ .Content }}
-{{- else if .ToolCalls }}<tool_call>
-{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
-{{- end }}
-</tool_call>
-{{- end }}
-{{- else if eq .Role "tool" }}<tool_response>
-{{ .Content }}
-</tool_response>
-{{- end }}<|eot_id|>
-{{- end }}
-{{- end }}<|start_header_id|>assistant<|end_header_id|>
-
-{{ else }}
-{{ if .System }}<|start_header_id|>system<|end_header_id|>
-
-{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
-
-{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
-
-{{ end }}{{ .Response }}
-{{- if .Response }}<|eot_id|>
-{{- end }}
--- a/server/testdata/tools/llama3-groq-tool-use.out
+++ b/server/testdata/tools/llama3-groq-tool-use.out
@ -1,24 +0,0 @@
-<|start_header_id|>system<|end_header_id|>
-
-You are a knowledgable assistant. You can answer questions and perform tasks. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
-<tool_call>
-{"name": <function-name>,"arguments": <args-dict>}
-</tool_call>
-
-Here are the available tools:
-<tools> {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}} </tools><|eot_id|><|start_header_id|>user<|end_header_id|>
-
-What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-<tool_call>
-{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
-</tool_call><|eot_id|><|start_header_id|>tool<|end_header_id|>
-
-<tool_response>
-22
-</tool_response><|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>
-
-What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
--- a/server/testdata/tools/mistral.gotmpl
+++ b/server/testdata/tools/mistral.gotmpl
@ -1,13 +1,13 @@
 {{- range $index, $_ := .Messages }}
 {{- if eq .Role "user" }}
-{{- if and (eq (len (slice $.Messages $index)) 1) $.Tools }}[AVAILABLE_TOOLS] {{ $.Tools }}[/AVAILABLE_TOOLS]
+{{- if and (eq (len (slice $.Messages $index)) 1) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS]
 {{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}

 {{ end }}{{ .Content }}[/INST]
 {{- else if eq .Role "assistant" }}
 {{- if .Content }} {{ .Content }}</s>
 {{- else if .ToolCalls }}[TOOL_CALLS] [
-{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}}
 {{- end }}]</s>
 {{- end }}
 {{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS]
--- a/template/template.go
+++ b/template/template.go
@ -150,9 +150,7 @@ func (t *Template) Vars() []string {

 type Values struct {
 	Messages []api.Message
-	api.Tools
-	Prompt string
-	Suffix string
+	Tools    []api.Tool

 	// forceLegacy is a flag used to test compatibility with legacy templates
 	forceLegacy bool
@ -206,18 +204,11 @@ func (t *Template) Subtree(fn func(parse.Node) bool) *template.Template {

 func (t *Template) Execute(w io.Writer, v Values) error {
 	system, messages := collate(v.Messages)
-	if v.Prompt != "" && v.Suffix != "" {
-		return t.Template.Execute(w, map[string]any{
-			"Prompt":   v.Prompt,
-			"Suffix":   v.Suffix,
-			"Response": "",
-		})
-	} else if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
+	if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
 		return t.Template.Execute(w, map[string]any{
 			"System":   system,
 			"Messages": messages,
 			"Tools":    v.Tools,
-			"Response": "",
 		})
 	}

@ -264,7 +255,6 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool {
 		if field, ok := n.(*parse.FieldNode); ok && slices.Contains(field.Ident, "Response") {
 			cut = true
-			return false
 		}

 		return cut
@ -272,9 +262,8 @@ func (t *Template) Execute(w io.Writer, v Values) error {

 	tree := parse.Tree{Root: nodes.(*parse.ListNode)}
 	if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{
-		"System":   system,
-		"Prompt":   prompt,
-		"Response": response,
+		"System": system,
+		"Prompt": prompt,
 	}); err != nil {
 		return err
 	}
--- a/template/template_test.go
+++ b/template/template_test.go
@ -260,26 +260,6 @@ func TestExecuteWithMessages(t *testing.T) {

 Hello friend![/INST] Hello human![INST] What is your name?[/INST] `,
 		},
-		{
-			"mistral assistant",
-			[]template{
-				{"no response", `[INST] {{ .Prompt }}[/INST] `},
-				{"response", `[INST] {{ .Prompt }}[/INST] {{ .Response }}`},
-				{"messages", `
-{{- range $i, $m := .Messages }}
-{{- if eq .Role "user" }}[INST] {{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}{{ end }}
-{{- end }}`},
-			},
-			Values{
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello friend!"},
-					{Role: "assistant", Content: "Hello human!"},
-					{Role: "user", Content: "What is your name?"},
-					{Role: "assistant", Content: "My name is Ollama and I"},
-				},
-			},
-			`[INST] Hello friend![/INST] Hello human![INST] What is your name?[/INST] My name is Ollama and I`,
-		},
 		{
 			"chatml",
 			[]template{
@ -379,38 +359,3 @@ Answer: `,
 		})
 	}
 }
-
-func TestExecuteWithSuffix(t *testing.T) {
-	tmpl, err := Parse(`{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
-{{- else }}{{ .Prompt }}
-{{- end }}`)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	cases := []struct {
-		name   string
-		values Values
-		expect string
-	}{
-		{
-			"message", Values{Messages: []api.Message{{Role: "user", Content: "hello"}}}, "hello",
-		},
-		{
-			"prompt suffix", Values{Prompt: "def add(", Suffix: "return x"}, "<PRE> def add( <SUF>return x <MID>",
-		},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.name, func(t *testing.T) {
-			var b bytes.Buffer
-			if err := tmpl.Execute(&b, tt.values); err != nil {
-				t.Fatal(err)
-			}
-
-			if diff := cmp.Diff(b.String(), tt.expect); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-			}
-		})
-	}
-}