only count output tensors

This commit is contained in:
Michael Yang 2024-04-25 14:41:50 -07:00
parent 5f73c08729
commit 7bb7cb8a60

View file

@@ -5,7 +5,6 @@ import (
 	"log/slog"
 	"os"
 	"strconv"
-	"strings"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
@@ -100,8 +99,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		return 0, 0
 	}

-	var layerCount int
 	layers := ggml.Tensors().Layers()

+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if k == "output" || k == "output_norm" {
+			memoryLayerOutput += v.size()
+		}
+	}
+
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}
+
+	var layerCount int

 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
 		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
@@ -115,15 +128,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 	}

-	var memoryLayerOutput uint64
-	for k, v := range layers {
-		if !strings.HasPrefix(k, "blk.") {
-			memoryLayerOutput += v.size()
-		}
+	if gpus[0].Library != "metal" || !opts.UseMMap {
+		// memory was not preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
 	}
-	memoryRequiredTotal += memoryLayerOutput

 	if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal