refactor tensor query

2024-04-03 15:00:31 -07:00 · 2024-04-03 15:00:31 -07:00 · 8b2c10061c
commit 8b2c10061c
parent c5c451ca3b
4 changed files with 54 additions and 42 deletions
--- a/llm/ggla.go
+++ b/llm/ggla.go
@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }

-func (llm *ggla) Tensors() []*Tensor {
+// Tensors returns the tensors held by llm.
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }

--- a/llm/ggml.go
+++ b/llm/ggml.go
@ -13,16 +13,6 @@ type GGML struct {
 	model
 }

-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@ -101,7 +91,7 @@ func fileType(fileType uint32) string {

+// model is the interface embedded by GGML. The ggla and gguf types each
+// provide its KV and Tensors methods.
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }

+// KV maps string keys to values of any type.
 type KV map[string]any
@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }

+// Tensors is a list of tensors that can be grouped into layers by name.
+type Tensors []*Tensor
+
+// Layers groups the tensors by the layer component of their dot-separated
+// names. A leading "blk" component is dropped, so "blk.0.attn_q.weight" is
+// stored as layers["0"]["attn_q.weight"] and "output.weight" as
+// layers["output"]["weight"]. A name with a single component is stored under
+// that component with an empty key.
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		// Only strip the "blk" prefix when something follows it; a tensor
+		// named exactly "blk" would otherwise leave parts empty and make
+		// the parts[0] accesses below panic.
+		if len(parts) > 1 && parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		name := parts[0]
+		layer, ok := layers[name]
+		if !ok {
+			layer = make(Layer)
+			layers[name] = layer
+		}
+
+		layer[strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+// Layer maps a tensor's name within a layer to the tensor itself.
+type Layer map[string]*Tensor
+
+// size returns the sum of size() over every tensor in the layer.
+func (l Layer) size() uint64 {
+	var total uint64
+	for _, tensor := range l {
+		total += tensor.size()
+	}
+
+	return total
+}
+
 type Tensor struct {
 	Name   string `json:"name"`
 	Kind   uint32 `json:"kind"`
@ -310,20 +330,16 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 	headCountKV := llm.KV().HeadCountKV()
 	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))

+	layers := llm.Tensors().Layers()
+
 	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
+	if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
 		attnQKVWeight1 = t.Shape[1]
-			break
-		}
 	}

-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
+	var ffnGate0Weight1 uint64 = 0
+	if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
+		ffnGate0Weight1 = t.Shape[1]
 	}

 	switch llm.KV().Architecture() {
@ -340,9 +356,9 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
 		), true
 	case "llama":
-		if ffnGate1 > 0 {
+		if ffnGate0Weight1 > 0 {
 			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
+			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
 		}

 		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
--- a/llm/gguf.go
+++ b/llm/gguf.go
@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }

-func (llm *gguf) Tensors() []*Tensor {
+// Tensors returns the tensors held by llm.
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }

--- a/llm/server.go
+++ b/llm/server.go
@ -77,11 +77,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	}

 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

 	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
 	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+		graph = int64(ggml.KV().GQA()*kv) / 6
 	}

 	usedMemory += graph
@ -92,9 +92,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option

 	requiredMemory := usedMemory

+	tensorLayers := ggml.Tensors().Layers()
+
 	var layers int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
 		requiredMemory += layerMemory

 		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
@ -103,7 +105,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		}
 	}

-	memOutputLayer := ggml.LayerSize("output.")
+	memOutputLayer := int64(tensorLayers["output"].size())
 	requiredMemory += memOutputLayer

 	// only offload output layer if all repeating layers are offloaded
@ -118,7 +120,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		"required", format.HumanBytes2(requiredMemory),
 		"used", format.HumanBytes2(usedMemory),
 		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(kv),
+		"kv", format.HumanBytes2(int64(kv)),
 		"graph", format.HumanBytes2(graph),
 	)

@ -294,18 +296,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}

-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}

-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
+	return int64(mem)
 }

 type ServerStatus int