diff --git a/llm/ggml.go b/llm/ggml.go
index 9cf9172e..47ec24a1 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@@ -368,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
 
 	layers := llm.Tensors().Layers()
 
+	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
@@ -403,6 +406,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2* // sizeof(float16)
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					context +
+					4* // sizeof(float32)
+						uint64(crossAttentionLayers.size)* // num cross attention layers
+						visionTokens*
+						tiles)
+		}
+
 		fullOffload = max(
 			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
 			// vocab graph
diff --git a/llm/memory.go b/llm/memory.go
index 16f9a743..521ed16f 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
-
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
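
For reviewers, a minimal standalone sketch (not part of this change) that evaluates the two KV-cache formulas `GraphSize` now computes. The context length, layer count, head counts, head dimensions, and the cross-attention layer count below are made-up illustration values; in the real code every one of them comes from the GGUF metadata via `llm.KV()`.

```go
package main

import "fmt"

func main() {
	// Hypothetical Llama-style shape, for illustration only.
	var (
		context    uint64 = 8192 // n_ctx
		blockCount uint64 = 32   // n_layer
		headsKV    uint64 = 8    // n_head_kv (grouped-query attention)
		headDimK   uint64 = 128  // n_embd_head_k
		headDimV   uint64 = 128  // n_embd_head_v
	)

	// Default case, mirroring GraphSize: sizeof(float16) * n_ctx * n_layer *
	// (n_embd_head_k + n_embd_head_v) * n_head_kv.
	kv := 2 * context * blockCount * (headDimK + headDimV) * headsKV
	fmt.Printf("fp16 KV cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))

	// Per-layer share, the amount memory.go folds into layerSize.
	fmt.Printf("per layer:     %d bytes (%.0f MiB)\n",
		kv/blockCount, float64(kv/blockCount)/(1<<20))

	// mllama variant, mirroring the new branch above: cross-attention layers
	// cache the fixed-size vision sequence (1601 tokens x 4 tiles) in fp32,
	// while the remaining layers cache the text context in fp16.
	// crossLayers = 8 is a made-up count for this sketch.
	var visionTokens, tiles, crossLayers uint64 = 1601, 4, 8
	kvMllama := headsKV * (headDimK + headDimV) *
		(2*(blockCount-crossLayers)*context + 4*crossLayers*visionTokens*tiles)
	fmt.Printf("mllama KV:     %d bytes (%.2f GiB)\n",
		kvMllama, float64(kvMllama)/(1<<30))
}
```

With these example numbers the fp16 cache comes out to exactly 1 GiB, i.e. 32 MiB per layer; that per-layer share is what `EstimateGPULayers` adds to `layerSize` once `GraphSize` returns.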