refactor tensor query
This commit is contained in:
parent
c5c451ca3b
commit
8b2c10061c
4 changed files with 54 additions and 42 deletions
|
@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
|
||||||
return llm.kv
|
return llm.kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *ggla) Tensors() []*Tensor {
|
func (llm *ggla) Tensors() Tensors {
|
||||||
return llm.tensors
|
return llm.tensors
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
62
llm/ggml.go
62
llm/ggml.go
|
@ -13,16 +13,6 @@ type GGML struct {
|
||||||
model
|
model
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ggml *GGML) LayerSize(prefix string) (n int64) {
|
|
||||||
for _, t := range ggml.Tensors() {
|
|
||||||
if strings.HasPrefix(t.Name, prefix) {
|
|
||||||
n += int64(t.size())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
fileTypeF32 uint32 = iota
|
fileTypeF32 uint32 = iota
|
||||||
fileTypeF16
|
fileTypeF16
|
||||||
|
@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
|
||||||
|
|
||||||
type model interface {
|
type model interface {
|
||||||
KV() KV
|
KV() KV
|
||||||
Tensors() []*Tensor
|
Tensors() Tensors
|
||||||
}
|
}
|
||||||
|
|
||||||
type KV map[string]any
|
type KV map[string]any
|
||||||
|
@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
|
||||||
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
|
return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type Tensors []*Tensor
|
||||||
|
|
||||||
|
func (ts Tensors) Layers() map[string]Layer {
|
||||||
|
layers := make(map[string]Layer)
|
||||||
|
for _, t := range ts {
|
||||||
|
parts := strings.Split(t.Name, ".")
|
||||||
|
if parts[0] == "blk" {
|
||||||
|
parts = parts[1:]
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := layers[parts[0]]; !ok {
|
||||||
|
layers[parts[0]] = make(Layer)
|
||||||
|
}
|
||||||
|
|
||||||
|
layers[parts[0]][strings.Join(parts[1:], ".")] = t
|
||||||
|
}
|
||||||
|
|
||||||
|
return layers
|
||||||
|
}
|
||||||
|
|
||||||
|
type Layer map[string]*Tensor
|
||||||
|
|
||||||
|
func (l Layer) size() (size uint64) {
|
||||||
|
for _, t := range l {
|
||||||
|
size += t.size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return size
|
||||||
|
}
|
||||||
|
|
||||||
type Tensor struct {
|
type Tensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Kind uint32 `json:"kind"`
|
Kind uint32 `json:"kind"`
|
||||||
|
@ -310,20 +330,16 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
|
||||||
headCountKV := llm.KV().HeadCountKV()
|
headCountKV := llm.KV().HeadCountKV()
|
||||||
vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
|
vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
|
||||||
|
|
||||||
|
layers := llm.Tensors().Layers()
|
||||||
|
|
||||||
var attnQKVWeight1 uint64 = 0
|
var attnQKVWeight1 uint64 = 0
|
||||||
for _, t := range llm.Tensors() {
|
if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
|
||||||
if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
|
|
||||||
attnQKVWeight1 = t.Shape[1]
|
attnQKVWeight1 = t.Shape[1]
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var ffnGate1 uint64 = 0
|
var ffnGate0Weight1 uint64 = 0
|
||||||
for _, t := range llm.Tensors() {
|
if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
|
||||||
if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
|
ffnGate0Weight1 = t.Shape[1]
|
||||||
ffnGate1 = t.Shape[1]
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
switch llm.KV().Architecture() {
|
switch llm.KV().Architecture() {
|
||||||
|
@ -340,9 +356,9 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
|
||||||
4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
|
4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
|
||||||
), true
|
), true
|
||||||
case "llama":
|
case "llama":
|
||||||
if ffnGate1 > 0 {
|
if ffnGate0Weight1 > 0 {
|
||||||
// moe
|
// moe
|
||||||
return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
|
return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
|
||||||
}
|
}
|
||||||
|
|
||||||
return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
|
return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
|
||||||
|
|
|
@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
|
||||||
return llm.kv
|
return llm.kv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *gguf) Tensors() []*Tensor {
|
func (llm *gguf) Tensors() Tensors {
|
||||||
return llm.tensors
|
return llm.tensors
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -77,11 +77,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
}
|
}
|
||||||
|
|
||||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||||
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
|
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||||
|
|
||||||
graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
|
graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
|
||||||
if !ok {
|
if !ok {
|
||||||
graph = int64(ggml.KV().GQA()) * kv / 6
|
graph = int64(ggml.KV().GQA()*kv) / 6
|
||||||
}
|
}
|
||||||
|
|
||||||
usedMemory += graph
|
usedMemory += graph
|
||||||
|
@ -92,9 +92,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
|
|
||||||
requiredMemory := usedMemory
|
requiredMemory := usedMemory
|
||||||
|
|
||||||
|
tensorLayers := ggml.Tensors().Layers()
|
||||||
|
|
||||||
var layers int
|
var layers int
|
||||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||||
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
|
layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
|
||||||
requiredMemory += layerMemory
|
requiredMemory += layerMemory
|
||||||
|
|
||||||
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
|
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
|
||||||
|
@ -103,7 +105,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
memOutputLayer := ggml.LayerSize("output.")
|
memOutputLayer := int64(tensorLayers["output"].size())
|
||||||
requiredMemory += memOutputLayer
|
requiredMemory += memOutputLayer
|
||||||
|
|
||||||
// only offload output layer if all repeating layers are offloaded
|
// only offload output layer if all repeating layers are offloaded
|
||||||
|
@ -118,7 +120,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
"required", format.HumanBytes2(requiredMemory),
|
"required", format.HumanBytes2(requiredMemory),
|
||||||
"used", format.HumanBytes2(usedMemory),
|
"used", format.HumanBytes2(usedMemory),
|
||||||
"available", format.HumanBytes2(availableMemory),
|
"available", format.HumanBytes2(availableMemory),
|
||||||
"kv", format.HumanBytes2(kv),
|
"kv", format.HumanBytes2(int64(kv)),
|
||||||
"graph", format.HumanBytes2(graph),
|
"graph", format.HumanBytes2(graph),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -294,18 +296,12 @@ func projectorMemoryRequirements(filename string) int64 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
prefixes := make(map[string]struct{})
|
var mem uint64
|
||||||
for _, layer := range ggml.Tensors() {
|
for _, layer := range ggml.Tensors().Layers() {
|
||||||
parts := strings.Split(layer.Name, ".")
|
mem += layer.size()
|
||||||
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var ask int64
|
return int64(mem)
|
||||||
for prefix := range prefixes {
|
|
||||||
ask += ggml.LayerSize(prefix)
|
|
||||||
}
|
|
||||||
|
|
||||||
return ask
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type ServerStatus int
|
type ServerStatus int
|
||||||
|
|
Loading…
Reference in a new issue