diff --git a/llm/ggla.go b/llm/ggla.go
index 591d5e00..cf14f214 100644
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 
diff --git a/llm/ggml.go b/llm/ggml.go
index 071a36c3..c1fdd0d1 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
@@ -310,20 +330,16 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 	headCountKV := llm.KV().HeadCountKV()
 	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
 
+	layers := llm.Tensors().Layers()
+
 	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
+	if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
+		attnQKVWeight1 = t.Shape[1]
 	}
 
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
+	var ffnGate0Weight1 uint64 = 0
+	if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
+		ffnGate0Weight1 = t.Shape[1]
 	}
 
 	switch llm.KV().Architecture() {
@@ -340,11 +356,11 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
 		), true
 	case "llama":
-		if ffnGate1 > 0 {
+		if ffnGate0Weight1 > 0 {
 			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
+			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
 		}
-
+
 		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
 	}
 
diff --git a/llm/gguf.go b/llm/gguf.go
index 7d804712..796642e3 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
 
diff --git a/llm/server.go b/llm/server.go
index 160effe7..19d8a061 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -77,11 +77,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
 	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
 	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+		graph = int64(ggml.KV().GQA()*kv) / 6
 	}
 
 	usedMemory += graph
@@ -92,9 +92,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 
 	requiredMemory := usedMemory
 
+	tensorLayers := ggml.Tensors().Layers()
+
 	var layers int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
 		requiredMemory += layerMemory
 
 		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
@@ -103,7 +105,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
+	memOutputLayer := int64(tensorLayers["output"].size())
 	requiredMemory += memOutputLayer
 
 	// only offload output layer if all repeating layers are offloaded
@@ -118,7 +120,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		"required", format.HumanBytes2(requiredMemory),
 		"used", format.HumanBytes2(usedMemory),
 		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(kv),
+		"kv", format.HumanBytes2(int64(kv)),
 		"graph", format.HumanBytes2(graph),
 	)
 
@@ -294,18 +296,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}
 
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
+	return int64(mem)
 }
 
 type ServerStatus int
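
For reference, a minimal self-contained sketch of how the Tensors.Layers() grouping added in llm/ggml.go behaves. It is not part of the patch; the Tensor stand-in, tensor names, and main function below are illustrative only. Names are split on ".", a leading "blk" segment is dropped, and tensors are keyed first by the following segment ("0", "1", ..., or "output"), then by the remainder of the name, so "blk.0.attn_qkv.weight" becomes layers["0"]["attn_qkv.weight"].

package main

import (
	"fmt"
	"strings"
)

// Stand-in for llm.Tensor; only the name matters for the grouping rule.
type Tensor struct {
	Name string
}

type Tensors []*Tensor
type Layer map[string]*Tensor

// Same grouping rule as the patch: strip a leading "blk" segment, key the
// outer map by the next segment, and key the inner map by the rest of the name.
func (ts Tensors) Layers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts {
		parts := strings.Split(t.Name, ".")
		if parts[0] == "blk" {
			parts = parts[1:]
		}
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}
	return layers
}

func main() {
	ts := Tensors{
		{Name: "blk.0.attn_qkv.weight"},
		{Name: "blk.1.ffn_gate.0.weight"},
		{Name: "output.weight"},
	}
	layers := ts.Layers()

	// "blk.0.attn_qkv.weight" is reachable as layers["0"]["attn_qkv.weight"].
	fmt.Println(layers["0"]["attn_qkv.weight"].Name)

	// Tensors without a "blk" prefix group under their first segment,
	// which is how the server looks up the output layer as layers["output"].
	fmt.Println(len(layers["output"]))
}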