runner.go: Remove unused arguments

Now that server.cpp is gone, we no longer need to pass arguments that were ignored and kept only for compatibility.
2024-10-30 16:54:49 -07:00 · 2024-10-30 16:54:49 -07:00 · a909417602
commit a909417602
parent 6cd566872b
5 changed files with 1 addition and 15 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
--- a/docs/api.md
+++ b/docs/api.md
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
-    "f16_kv": true,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")

-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
--- a/llm/server.go
+++ b/llm/server.go
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}

 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}

-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()

 	for _, g := range gpus {
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":                    {"num_gpu", "1"},
 		"main_gpu 1":                   {"main_gpu", "1"},
 		"low_vram true":                {"low_vram", "true"},
-		"f16_kv true":                  {"f16_kv", "true"},
 		"logits_all true":              {"logits_all", "true"},
 		"vocab_only true":              {"vocab_only", "true"},
 		"use_mmap true":                {"use_mmap", "true"},