offload 75% of available vram to improve stability (#921)
commit 2665f3c28e
parent a79f030e75
1 changed file with 7 additions and 4 deletions

llm/llama.go (+7 −4)
@@ -243,12 +243,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		return 0
 	}
 
-	// Calculate bytes per layer
-	// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+	/*
+		Calculate bytes per layer; this will roughly be the size of the model file divided by the number of layers.
+		We can store the model weights and the kv cache in vram;
+		to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
+	*/
 	bytesPerLayer := fileSizeBytes / numLayer
 
-	// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-	layers := int(freeBytes/bytesPerLayer) * 92 / 100
+	// 75% of the absolute max number of layers we can fit in available VRAM; off-loading too many layers to the GPU can cause OOM errors
+	layers := int(freeBytes/bytesPerLayer) * 3 / 4
 	log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
 
 	return layers
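For context, the heuristic after this change can be read in isolation. Below is a minimal standalone Go sketch of that logic; numGPULayers and the freeBytes parameter are illustrative stand-ins for this example only (the real NumGPU takes api.Options and obtains free VRAM elsewhere in llm/llama.go), not the actual function signature.

package main

import (
	"fmt"
	"log"
)

// numGPULayers is a sketch of the heuristic after this commit: estimate
// the size of one layer from the model file, then offload only 75% of
// the layers that would fit in free VRAM, leaving headroom for the kv
// cache and other allocations. freeBytes is assumed to come from a GPU
// query elsewhere; this sketch does not cap the result at numLayer,
// matching the hunk shown above.
func numGPULayers(numLayer, fileSizeBytes, freeBytes int64) int {
	if numLayer <= 0 || fileSizeBytes <= 0 {
		return 0
	}

	// Rough bytes-per-layer estimate: model file size divided by layer count.
	bytesPerLayer := fileSizeBytes / numLayer

	// 75% of the absolute max number of layers that fit in available VRAM;
	// off-loading too many layers to the GPU can cause OOM errors.
	layers := int(freeBytes/bytesPerLayer) * 3 / 4

	log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
	return layers
}

func main() {
	// Example: a ~13 GiB model with 40 layers and 8 GiB of free VRAM
	// yields 24 layers at the absolute max, capped to 18 at 75%.
	fmt.Println(numGPULayers(40, 13*1024*1024*1024, 8*1024*1024*1024))
}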