diff --git a/api/types.go b/api/types.go
index df7bab21..d09ad06c 100644
--- a/api/types.go
+++ b/api/types.go
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
diff --git a/docs/api.md b/docs/api.md
index fe2eb82c..2836d73f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 33900bd2..0a37dee0 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
diff --git a/llm/server.go b/llm/server.go
index a4c99dd9..5ca6aa32 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
diff --git a/parser/parser_test.go b/parser/parser_test.go
index ebd8a7ff..6a4d853f 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":       {"num_gpu", "1"},
 		"main_gpu 1":      {"main_gpu", "1"},
 		"low_vram true":   {"low_vram", "true"},
-		"f16_kv true":     {"f16_kv", "true"},
 		"logits_all true": {"logits_all", "true"},
 		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true":   {"use_mmap", "true"},