Merge pull request #4215 from ollama/mxyng/mem
llm: add minimum based on layer size
This commit is contained in:
commit
70edb9bc4d
3 changed files with 7 additions and 7 deletions
|
@@ -31,8 +31,8 @@ type handles struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
cudaMinimumMemory = 457 * format.MebiByte
|
cudaMinimumMemory = 256 * format.MebiByte
|
||||||
rocmMinimumMemory = 457 * format.MebiByte
|
rocmMinimumMemory = 256 * format.MebiByte
|
||||||
)
|
)
|
||||||
|
|
||||||
var gpuMutex sync.Mutex
|
var gpuMutex sync.Mutex
|
||||||
|
|
|
@@ -15,7 +15,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
metalMinimumMemory = 512 * format.MebiByte
|
metalMinimumMemory = 384 * format.MebiByte
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetGPUInfo() GpuInfoList {
|
func GetGPUInfo() GpuInfoList {
|
||||||
|
|
|
@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
graphPartialOffload = graphFullOffload
|
graphPartialOffload = graphFullOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
|
layers := ggml.Tensors().Layers()
|
||||||
|
|
||||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
|
||||||
|
|
||||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
|
||||||
|
|
||||||
if memoryRequiredPartial > memoryAvailable {
|
if memoryRequiredPartial > memoryAvailable {
|
||||||
slog.Debug("insufficient VRAM to load any model layers")
|
slog.Debug("insufficient VRAM to load any model layers")
|
||||||
return 0, 0
|
return 0, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
layers := ggml.Tensors().Layers()
|
|
||||||
|
|
||||||
var memoryLayerOutput uint64
|
var memoryLayerOutput uint64
|
||||||
if layer, ok := layers["output_norm"]; ok {
|
if layer, ok := layers["output_norm"]; ok {
|
||||||
memoryLayerOutput += layer.size()
|
memoryLayerOutput += layer.size()
|
||||||
|
|
Loading…
Reference in a new issue