Merge pull request #4215 from ollama/mxyng/mem

llm: add minimum based on layer size
This commit is contained in:
Michael Yang 2024-05-07 09:26:33 -07:00 committed by GitHub
commit 70edb9bc4d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 7 additions and 7 deletions

View file

@@ -31,8 +31,8 @@ type handles struct {
 }
 const (
-cudaMinimumMemory = 457 * format.MebiByte
-rocmMinimumMemory = 457 * format.MebiByte
+cudaMinimumMemory = 256 * format.MebiByte
+rocmMinimumMemory = 256 * format.MebiByte
 )
 var gpuMutex sync.Mutex

View file

@@ -15,7 +15,7 @@ import (
 )
 const (
-metalMinimumMemory = 512 * format.MebiByte
+metalMinimumMemory = 384 * format.MebiByte
 )
 func GetGPUInfo() GpuInfoList {

View file

@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 graphPartialOffload = graphFullOffload
 }
+layers := ggml.Tensors().Layers()
 // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-memoryRequiredTotal := memoryMinimum + graphFullOffload
+memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
 // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-memoryRequiredPartial := memoryMinimum + graphPartialOffload
+memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 if memoryRequiredPartial > memoryAvailable {
 slog.Debug("insufficient VRAM to load any model layers")
 return 0, 0
 }
-layers := ggml.Tensors().Layers()
 var memoryLayerOutput uint64
 if layer, ok := layers["output_norm"]; ok {
 memoryLayerOutput += layer.size()