offload 75% of available vram to improve stability (#921)
commit 2665f3c28e
parent a79f030e75
1 changed file with 7 additions and 4 deletions

llm/llama.go (+7 −4)
@@ -243,12 +243,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		return 0
 	}
 
-	// Calculate bytes per layer
-	// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+	/*
+		Calculate bytes per layer; this will roughly be the size of the model file divided by the number of layers.
+		We can store the model weights and the kv cache in vram;
+		to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
+	*/
 	bytesPerLayer := fileSizeBytes / numLayer
 
-	// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-	layers := int(freeBytes/bytesPerLayer) * 92 / 100
+	// 75% of the absolute max number of layers we can fit in available VRAM; off-loading too many layers to the GPU can cause OOM errors
+	layers := int(freeBytes/bytesPerLayer) * 3 / 4
 	log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
 
 	return layers
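For context, the heuristic after this change can be read in isolation. Below is a minimal standalone Go sketch of that logic; numGPULayers and the freeBytes parameter are illustrative stand-ins for this example only (the real NumGPU takes api.Options and obtains free VRAM elsewhere in llm/llama.go), not the actual function signature.

package main

import (
	"fmt"
	"log"
)

// numGPULayers is a sketch of the heuristic after this commit: estimate
// the size of one layer from the model file, then offload only 75% of
// the layers that would fit in free VRAM, leaving headroom for the kv
// cache and other allocations. freeBytes is assumed to come from a GPU
// query elsewhere; this sketch does not cap the result at numLayer,
// matching the hunk shown above.
func numGPULayers(numLayer, fileSizeBytes, freeBytes int64) int {
	if numLayer <= 0 || fileSizeBytes <= 0 {
		return 0
	}

	// Rough bytes-per-layer estimate: model file size divided by layer count.
	bytesPerLayer := fileSizeBytes / numLayer

	// 75% of the absolute max number of layers that fit in available VRAM;
	// off-loading too many layers to the GPU can cause OOM errors.
	layers := int(freeBytes/bytesPerLayer) * 3 / 4

	log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
	return layers
}

func main() {
	// Example: a ~13 GiB model with 40 layers and 8 GiB of free VRAM
	// yields 24 layers at the absolute max, capped to 18 at 75%.
	fmt.Println(numGPULayers(40, 13*1024*1024*1024, 8*1024*1024*1024))
}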