count memory up to NumGPU

This commit is contained in:
Michael Yang 2024-05-10 14:40:37 -07:00
parent 9c76b30d72
commit 50b9056e09

View file

@ -53,6 +53,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
opts.NumCtx = max(opts.NumCtx, 2048) opts.NumCtx = max(opts.NumCtx, 2048)
} }
layers := ggml.Tensors().Layers()
// add one layer worth of memorr as a buffer
memoryMinimum += layers["blk.0"].size()
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
@ -73,13 +77,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
graphPartialOffload = graphFullOffload graphPartialOffload = graphFullOffload
} }
layers := ggml.Tensors().Layers()
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers) // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size() memoryRequiredTotal := memoryMinimum + graphFullOffload
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers) // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size() memoryRequiredPartial := memoryMinimum + graphPartialOffload
var memoryLayerOutput uint64 var memoryLayerOutput uint64
if layer, ok := layers["output_norm"]; ok { if layer, ok := layers["output_norm"]; ok {
@ -106,7 +108,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
memoryLayer += kv / ggml.KV().BlockCount() memoryLayer += kv / ggml.KV().BlockCount()
memoryRequiredTotal += memoryLayer memoryRequiredTotal += memoryLayer
if memoryAvailable > memoryRequiredPartial+memoryLayer { if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
memoryRequiredPartial += memoryLayer memoryRequiredPartial += memoryLayer
layerCount++ layerCount++
} }
@ -117,7 +119,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
memoryRequiredTotal += memoryLayerOutput memoryRequiredTotal += memoryLayerOutput
} }
if memoryAvailable > memoryRequiredTotal { if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
layerCount = int(ggml.KV().BlockCount()) + 1 layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal memoryRequiredPartial = memoryRequiredTotal
} }
@ -128,10 +130,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
"offload to gpu", "offload to gpu",
slog.Group( slog.Group(
"layers", "layers",
// actual number of layers offloaded // requested number of layers to offload
"real", opts.NumGPU, "requested", opts.NumGPU,
// estimated number of layers that can be offloaded // estimated number of layers that can be offloaded
"estimate", layerCount, "real", layerCount,
), ),
slog.Group( slog.Group(
"memory", "memory",