From a9094176024a5fa49b3e23f5d67001481a434d63 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 30 Oct 2024 16:54:49 -0700
Subject: [PATCH] runner.go: Remove unused arguments

Now that server.cpp is gone, we don't need to keep passing arguments
that were ignored and kept only for compatibility.
---
 api/types.go           | 3 +--
 docs/api.md            | 1 -
 llama/runner/runner.go | 6 ------
 llm/server.go          | 5 -----
 parser/parser_test.go  | 1 -
 5 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/api/types.go b/api/types.go
index df7bab21..d09ad06c 100644
--- a/api/types.go
+++ b/api/types.go
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
diff --git a/docs/api.md b/docs/api.md
index fe2eb82c..2836d73f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 33900bd2..0a37dee0 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
diff --git a/llm/server.go b/llm/server.go
index a4c99dd9..5ca6aa32 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
diff --git a/parser/parser_test.go b/parser/parser_test.go
index ebd8a7ff..6a4d853f 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":       {"num_gpu", "1"},
 		"main_gpu 1":      {"main_gpu", "1"},
 		"low_vram true":   {"low_vram", "true"},
-		"f16_kv true":     {"f16_kv", "true"},
 		"logits_all true": {"logits_all", "true"},
 		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true":   {"use_mmap", "true"},
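
A note on the pattern removed from llama/runner/runner.go: registering a
flag and discarding its value is the usual Go idiom for accepting an option
from older callers without acting on it. A minimal, self-contained sketch of
that idiom follows; the --legacy-opt flag name is hypothetical, not part of
the runner.

	package main

	import (
		"flag"
		"fmt"
	)

	func main() {
		// A flag whose value the program actually uses.
		verbose := flag.Bool("verbose", false, "enable verbose output")

		// Compatibility shim: register --legacy-opt (hypothetical name) so
		// old command lines still parse, but discard its value. This is the
		// idiom the patch deletes now that no caller passes these options.
		_ = flag.Bool("legacy-opt", false, "ignored; kept for compatibility")

		flag.Parse()
		fmt.Println("verbose:", *verbose)
	}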