Merge pull request #4215 from ollama/mxyng/mem

llm: add minimum based on layer size

commit 70edb9bc4d
3 changed files with 7 additions and 7 deletions
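Summary: the flat VRAM floors reserved per backend shrink (CUDA and ROCm from 457 to 256 MiB, Metal from 512 to 384 MiB), and EstimateGPULayers instead folds the size of the first block layer, layers["blk.0"], into both the full- and partial-offload estimates, so the effective minimum now scales with the model's layer size rather than a one-size-fits-all constant.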
@@ -31,8 +31,8 @@ type handles struct {
 }

 const (
-	cudaMinimumMemory = 457 * format.MebiByte
-	rocmMinimumMemory = 457 * format.MebiByte
+	cudaMinimumMemory = 256 * format.MebiByte
+	rocmMinimumMemory = 256 * format.MebiByte
 )

 var gpuMutex sync.Mutex
@@ -15,7 +15,7 @@ import (
 )

 const (
-	metalMinimumMemory = 512 * format.MebiByte
+	metalMinimumMemory = 384 * format.MebiByte
 )

 func GetGPUInfo() GpuInfoList {
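The two constant hunks above lower the flat per-backend reserve. A standalone sketch of that arithmetic (illustrative only, not repository code; the variable names are hypothetical, and mebiByte is assumed to match ollama's format.MebiByte, i.e. 1 << 20):

// Illustrative arithmetic only. The old/new values come from the diff
// above; mebiByte is an assumption standing in for format.MebiByte.
package main

import "fmt"

const mebiByte = 1 << 20

func main() {
	oldCUDA, newCUDA := 457*mebiByte, 256*mebiByte
	oldMetal, newMetal := 512*mebiByte, 384*mebiByte

	// The flat reserve shrinks; the gap is covered by the per-model
	// blk.0 term added in the next hunk.
	fmt.Printf("cuda/rocm reserve: -%d MiB\n", (oldCUDA-newCUDA)/mebiByte) // 201 MiB
	fmt.Printf("metal reserve:     -%d MiB\n", (oldMetal-newMetal)/mebiByte) // 128 MiB
}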
@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}

+	layers := ggml.Tensors().Layers()
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
+	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()

 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()

 	if memoryRequiredPartial > memoryAvailable {
 		slog.Debug("insufficient VRAM to load any model layers")
 		return 0, 0
 	}

-	layers := ggml.Tensors().Layers()
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
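For the memory-estimate hunk, a self-contained sketch of the post-change arithmetic. All sizes here are made-up examples, and the variable names merely mirror the diff; this is not the code in EstimateGPULayers, which, as the tail of the hunk shows, goes on to account for the output layer and the remaining per-layer tensors:

// Standalone sketch of the revised minimum check; every size below is a
// hypothetical example, and mebiByte again stands in for format.MebiByte.
package main

import "fmt"

const mebiByte = 1 << 20

func main() {
	var (
		memoryAvailable     uint64 = 4096 * mebiByte // hypothetical free VRAM
		memoryMinimum       uint64 = 256 * mebiByte  // the new cudaMinimumMemory floor
		graphFullOffload    uint64 = 900 * mebiByte  // illustrative compute-graph size
		graphPartialOffload        = graphFullOffload
		firstLayer          uint64 = 450 * mebiByte // illustrative layers["blk.0"].size()
	)

	// Both estimates now include the first block layer, so the effective
	// minimum scales with the model instead of being a flat constant.
	memoryRequiredTotal := memoryMinimum + graphFullOffload + firstLayer
	memoryRequiredPartial := memoryMinimum + graphPartialOffload + firstLayer

	if memoryRequiredPartial > memoryAvailable {
		fmt.Println("insufficient VRAM to load any model layers")
		return
	}
	fmt.Printf("total=%d MiB partial=%d MiB\n",
		memoryRequiredTotal/mebiByte, memoryRequiredPartial/mebiByte) // 1606 1606
}

With these example sizes the check passes and both estimates land at 1606 MiB; if the first layer alone pushed memoryRequiredPartial past memoryAvailable, the function would bail out before offloading anything, which is the point of adding the blk.0 term.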