use runner if cuda alloc won't fit

Jeffrey Morgan 2024-01-09 00:44:34 -05:00
parent 6566387ae3
commit f387e9631b

@@ -100,10 +100,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
-		// This handles two cases:
-		// 1. overhead + tensors are always loaded into scratch memory even with num_gpu 0
-		// 2. it seems llama.cpp always tries to allocate the entire kv cache (even if later split into layers) into vram or crashes
-		if requiredAlloc > available || requiredKv > available {
+		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
+		if requiredAlloc > available {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
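
In effect, this hunk gates the CUDA runner on its always-resident allocation alone, rather than also requiring the full KV cache to fit. A minimal sketch of that decision in isolation, assuming hypothetical byte sizes and stand-in names (pickLibrary and its parameters are illustrative, not the upstream API):

```go
package main

import (
	"fmt"
	"log"
)

// pickLibrary chooses the CUDA runner only when its always-resident
// allocation fits in free VRAM; otherwise it falls back to the CPU
// runner and zeroes the offloaded layer count, mirroring the hunk above.
// All names and sizes here are illustrative stand-ins.
func pickLibrary(requiredAlloc, available uint64, numGPU *int) string {
	// requiredAlloc is always loaded for the CUDA runner, so don't
	// pick that runner if it won't fit.
	if requiredAlloc > available {
		log.Printf("not enough vram available, falling back to CPU only")
		*numGPU = 0
		return "cpu"
	}
	return "cuda"
}

func main() {
	numGPU := 33                              // hypothetical requested layer count
	lib := pickLibrary(6<<30, 4<<30, &numGPU) // 6 GiB needed vs 4 GiB free
	fmt.Println(lib, numGPU)                  // prints: cpu 0
}
```
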
@@ -127,8 +125,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumGQA = 0
 		opts.RopeFrequencyBase = 0.0
 		opts.RopeFrequencyScale = 0.0
-		gpuInfo := gpu.GetGPUInfo()
-		return newLlmServer(gpuInfo.Library, model, adapters, projectors, opts)
+		return newLlmServer(library, model, adapters, projectors, opts)
 	}
 	// Give any native cgo implementations an opportunity to initialize
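
The second hunk is the companion change: since library may have been rewritten to "cpu" above, re-reading gpu.GetGPUInfo().Library at the return site would report "cuda" again and silently undo the fallback. A hedged sketch of why passing the local variable through matters, with stand-in types and values (getGPUInfo and the sizes below are illustrative):

```go
package main

import "fmt"

// gpuInfo and getGPUInfo stand in for the gpu package; values are illustrative.
type gpuInfo struct{ Library string }

func getGPUInfo() gpuInfo { return gpuInfo{Library: "cuda"} }

func chooseRunner() string {
	library := getGPUInfo().Library
	requiredAlloc, available := uint64(6<<30), uint64(4<<30) // hypothetical sizes
	if requiredAlloc > available {
		library = "cpu" // the fallback from the first hunk
	}
	// Pass the possibly rewritten library through; calling getGPUInfo()
	// again here would yield "cuda" and discard the fallback decision.
	return library
}

func main() {
	fmt.Println(chooseRunner()) // prints: cpu
}
```
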