runner.go: Remove unused arguments
Now that server.cpp is gone, we no longer need to pass arguments that were ignored and kept only for compatibility.
This commit is contained in:
parent 6cd566872b
commit a909417602
5 changed files with 1 addition and 15 deletions
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`

@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 		NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 		NumThread: 0,  // let the runtime decide
 		LowVRAM:   false,
-		F16KV:     true,
 		UseMLock:  false,
 		UseMMap:   nil,
 	},

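Keeping the F16KV field in the struct while marking it deprecated means request bodies from older clients still decode; the value is parsed and then simply never consulted. A minimal standalone sketch of that pattern (hypothetical Options type, not the actual api package):

package main

import (
	"encoding/json"
	"fmt"
)

// Options is a hypothetical stand-in for the API options struct above.
// F16KV stays in the schema so older request bodies still decode cleanly.
type Options struct {
	NumGPU int  `json:"num_gpu,omitempty"`
	F16KV  bool `json:"f16_kv,omitempty"` // Deprecated: parsed but never read
}

func main() {
	// An older client that still sends the deprecated option.
	body := []byte(`{"num_gpu": 1, "f16_kv": true}`)

	var opts Options
	if err := json.Unmarshal(body, &opts); err != nil {
		panic(err)
	}

	// The request decodes without error; F16KV is simply ignored downstream.
	fmt.Printf("num_gpu=%d, f16_kv=%v (ignored)\n", opts.NumGPU, opts.F16KV)
}
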
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,

@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)

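The deleted `_ = flag.Bool(...)` lines were compatibility shims: registering a flag keeps flag.Parse from rejecting a command line that still passes it, even though the value is never read. A small standalone sketch of that accept-and-discard pattern (hypothetical flag set, not the runner's real one):

package main

import (
	"flag"
	"fmt"
)

func main() {
	// A flag the program actually uses.
	mlock := flag.Bool("mlock", false, "force system to keep model in RAM")

	// Accept-and-discard: without this registration, passing -embedding
	// would make flag.Parse print an error and exit. The value itself is
	// thrown away, which is what the removed lines did.
	_ = flag.Bool("embedding", false, "ignored, kept for compatibility")

	flag.Parse()
	fmt.Println("mlock:", *mlock)
}
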
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {

@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {

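The removed `if !opts.F16KV` branch was the last place the option influenced the subprocess command line: NewLlamaServer builds the runner's argv by appending conditional flags to params. A hedged sketch of that shape, using a cut-down hypothetical options struct (the UseMLock-to---mlock mapping here is illustrative, not copied from the real code):

package main

import (
	"fmt"
	"strconv"
)

// serverOptions is a hypothetical stand-in for the real options struct.
type serverOptions struct {
	NumCtx   int
	NumBatch int
	UseMLock bool
}

// buildParams mirrors how the server assembles the runner's argv:
// unconditional flags first, booleans appended only when they matter.
func buildParams(model string, opts serverOptions) []string {
	params := []string{
		"--model", model,
		"--ctx-size", strconv.Itoa(opts.NumCtx),
		"--batch-size", strconv.Itoa(opts.NumBatch),
	}
	if opts.UseMLock {
		params = append(params, "--mlock")
	}
	return params
}

func main() {
	args := buildParams("model.gguf", serverOptions{NumCtx: 2048, NumBatch: 512, UseMLock: true})
	fmt.Println(args)
}
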
@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":       {"num_gpu", "1"},
 		"main_gpu 1":      {"main_gpu", "1"},
 		"low_vram true":   {"low_vram", "true"},
-		"f16_kv true":     {"f16_kv", "true"},
 		"logits_all true": {"logits_all", "true"},
 		"vocab_only true": {"vocab_only", "true"},
 		"use_mmap true":   {"use_mmap", "true"},

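The test change is one deleted map entry because TestParseFileParameters is table-driven: each key is an input parameter line and each value the expected name/value pair, so dropping f16_kv means dropping its row. A minimal sketch of that test shape (hypothetical parseLine helper, not the real parser):

package parser

import (
	"strings"
	"testing"
)

// parseLine is a hypothetical stand-in for the parser under test:
// it splits a "name value" parameter line into its two fields.
func parseLine(s string) (name, value string) {
	name, value, _ = strings.Cut(s, " ")
	return name, value
}

func TestParseLine(t *testing.T) {
	cases := map[string][2]string{
		"num_gpu 1":     {"num_gpu", "1"},
		"low_vram true": {"low_vram", "true"},
	}
	for input, want := range cases {
		name, value := parseLine(input)
		if name != want[0] || value != want[1] {
			t.Errorf("parseLine(%q) = %q, %q; want %q, %q",
				input, name, value, want[0], want[1])
		}
	}
}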