offload 75% of available vram to improve stability (#921)

This commit is contained in:
Bruce MacDonald 2023-10-26 20:49:55 -04:00 committed by GitHub
parent a79f030e75
commit 2665f3c28e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -243,12 +243,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
return 0
}
// Calculate bytes per layer
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
/*
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram,
to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := fileSizeBytes / numLayer
// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
layers := int(freeBytes/bytesPerLayer) * 92 / 100
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(freeBytes/bytesPerLayer) * 3 / 4
log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
return layers