diff --git a/llm/memory.go b/llm/memory.go
index dc620828..7ac7d8e7 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -5,7 +5,6 @@ import (
 	"log/slog"
 	"os"
 	"strconv"
-	"strings"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
@@ -100,8 +99,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		return 0, 0
 	}

-	var layerCount int
 	layers := ggml.Tensors().Layers()
+
+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if k == "output" || k == "output_norm" {
+			memoryLayerOutput += v.size()
+		}
+	}
+
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}
+
+	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
 		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()

@@ -115,15 +128,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 	}

-	var memoryLayerOutput uint64
-	for k, v := range layers {
-		if !strings.HasPrefix(k, "blk.") {
-			memoryLayerOutput += v.size()
-		}
+	if gpus[0].Library != "metal" || !opts.UseMMap {
+		// memory was not preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
 	}

-	memoryRequiredTotal += memoryLayerOutput
-
 	if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
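
A minimal sketch of the accounting the patch introduces, pulled out of `EstimateGPULayers` for readability. The helper name and free-standing form are hypothetical (the patch inlines this logic); the branch condition and the rationale come from the patch's own comments: with metal + mmap the output tensors are preallocated, so their size counts against both the partial and total estimates up front, while every other path folds it into the total only.

```go
package main

import "fmt"

// chargeOutputTensors adds the combined size of the "output" and
// "output_norm" tensors to the running memory estimates. On metal
// with mmap enabled the memory is preallocated, so it is charged to
// both the total and the partial estimate immediately; otherwise it
// is charged to the total alone (in the patch, after the per-layer
// loop has run).
func chargeOutputTensors(library string, useMMap bool, outputSize uint64, total, partial *uint64) {
	if library == "metal" && useMMap {
		*total += outputSize
		*partial += outputSize
	} else {
		*total += outputSize
	}
}

func main() {
	var total, partial uint64
	outputSize := uint64(256 << 20) // e.g. 256 MiB of output tensors

	chargeOutputTensors("metal", true, outputSize, &total, &partial)
	fmt.Println(total, partial) // both estimates charged up front on metal+mmap
}
```

Note the ordering consequence: because the non-metal path only adds `memoryLayerOutput` after the layer loop, the per-layer fit checks inside the loop are unaffected, whereas on metal+mmap the preallocated output memory reduces headroom before any layer is counted.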