Merge pull request #3478 from ollama/mxyng/tensor-layer

refactor tensor query
Michael Yang 2024-04-10 12:45:03 -07:00, committed by GitHub
commit 5a25f93522
8 changed files with 135 additions and 111 deletions

format/format.go

@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
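Aside: the only change above is the parameter type, so every memory figure formatted below is unsigned by construction. A minimal runnable sketch of the unsigned formatter; the constants and the extra GiB case here are assumptions for illustration, not part of this diff:

package main

import "fmt"

// Assumed constants for illustration; the real ones live in ollama's format package.
const (
	MebiByte = 1 << 20
	GibiByte = 1 << 30
)

// humanBytes2 sketches the patched signature: with an unsigned input,
// callers can no longer hand it a negative "free memory" value.
func humanBytes2(b uint64) string {
	switch {
	case b >= GibiByte:
		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(humanBytes2(1073741824)) // 1.0 GiB
	fmt.Println(humanBytes2(536870912))  // 512.0 MiB
}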

gpu/gpu.go

@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
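The override path deserves a second look: OLLAMA_MAX_VRAM is still parsed with strconv.ParseInt (signed), then cast to uint64 for the new return type, so a negative override would wrap to a huge value. A self-contained sketch of that pattern; maxVRAMOverride is a hypothetical helper, not a function in the codebase:

package main

import (
	"fmt"
	"os"
	"strconv"
)

// maxVRAMOverride isolates the env-override logic from CheckVRAM: parse as
// signed int64, then cast to the unsigned return type. The value is assumed
// to be non-negative; a negative string would wrap around after the cast.
func maxVRAMOverride() (uint64, bool, error) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit == "" {
		return 0, false, nil
	}
	avail, err := strconv.ParseInt(userLimit, 10, 64)
	if err != nil {
		return 0, false, fmt.Errorf("invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
	}
	return uint64(avail), true, nil
}

func main() {
	os.Setenv("OLLAMA_MAX_VRAM", "8589934592") // pretend 8 GiB
	if v, ok, err := maxVRAMOverride(); err == nil && ok {
		fmt.Printf("override: %d bytes\n", v)
	}
}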

gpu/gpu_darwin.go

@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {

gpu/types.go

@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }

llm/ggla.go

@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }

llm/ggml.go

@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
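To see what the new Tensors.Layers helper produces, here is a self-contained sketch using a pared-down Tensor (the real struct also carries Kind, Offset, and Shape). Names like "blk.0.attn_q.weight" group under layer "0", while "output.weight" groups under "output":

package main

import (
	"fmt"
	"strings"
)

// Tensor here is a simplified stand-in for the llm/ggml.go type.
type Tensor struct{ Name string }

type Tensors []*Tensor

type Layer map[string]*Tensor

// Layers groups tensors by their leading name component, stripping the
// "blk" prefix so "blk.0.attn_q.weight" lands under key "0".
func (ts Tensors) Layers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts {
		parts := strings.Split(t.Name, ".")
		if parts[0] == "blk" {
			parts = parts[1:]
		}
		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}
		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}
	return layers
}

func main() {
	ts := Tensors{
		{Name: "blk.0.attn_q.weight"},
		{Name: "blk.0.ffn_up.weight"},
		{Name: "blk.1.attn_q.weight"},
		{Name: "output.weight"},
	}
	for name, layer := range ts.Layers() {
		fmt.Println(name, len(layer)) // "0 2", "1 1", "output 1" (map order varies)
	}
}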
@@ -304,49 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
-	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
-	}
-
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
-	}
-
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
-		}
-
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	}
 
-	return 0, false
+	return
 }
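GraphSize now returns two byte estimates instead of one size plus an ok flag. As a worked example, plugging hypothetical llama-style dimensions (batch 512, context 2048, embedding 4096, 32 heads; illustration only, not values from this diff) into the llama fullOffload formula:

package main

import "fmt"

func main() {
	// Hypothetical llama-style dimensions, chosen for illustration only.
	var batch, context, embedding, heads uint64 = 512, 2048, 4096, 32

	// Matches the "llama" fullOffload formula in GraphSize.
	fullOffload := 4 * batch * (1 + 4*embedding + context*(1+heads))

	fmt.Printf("%d bytes (~%.1f MiB)\n", fullOffload, float64(fullOffload)/(1<<20))
	// Output: 171968512 bytes (~164.0 MiB)
}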

llm/gguf.go

@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }

llm/server.go

@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
@@ -65,67 +61,79 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
-
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
-	}
-
-	requiredMemory := usedMemory
-
-	var layers int
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
-		requiredMemory += layerMemory
-
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
+	}
+
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
+		}
+	}
+
+	var layerCount int
+	layers := ggml.Tensors().Layers()
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
+
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
+
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
-	requiredMemory += memOutputLayer
-
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
+
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info(
 		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
 		"kv", format.HumanBytes2(kv),
-		"graph", format.HumanBytes2(graph),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
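The fp16 KV-cache comment above is the same formula as before, only computed in uint64 now. Worked through with hypothetical 7B-class dimensions (n_ctx 2048, n_layer 32, n_embd 4096, n_head 32, n_head_kv 32; illustrative values, not from this diff), it comes out to exactly 1 GiB:

package main

import "fmt"

func main() {
	// Hypothetical model dimensions for illustration (roughly 7B llama-class).
	var nCtx, nLayer, nEmbd, nHead, nHeadKV uint64 = 2048, 32, 4096, 32, 32

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV

	fmt.Printf("%d bytes (%.1f GiB)\n", kv, float64(kv)/(1<<30))
	// Output: 1073741824 bytes (1.0 GiB)
}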
@@ -282,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -294,18 +302,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}
 
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
+	return mem
 }
 
 type ServerStatus int
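The rewrite drops the two-pass prefix bookkeeping in favor of one pass over Tensors().Layers(). A sketch of the new accounting with simplified types and made-up sizes; none of these layer names or numbers come from the diff:

package main

import "fmt"

// Layer here maps tensor names to byte sizes: a pared-down stand-in for
// llm's Layer (map[string]*Tensor) and Tensor.size().
type Layer map[string]uint64

func (l Layer) size() (size uint64) {
	for _, s := range l {
		size += s
	}
	return size
}

func main() {
	// Hypothetical projector layers with made-up sizes.
	layers := map[string]Layer{
		"v":  {"patch_embd.weight": 1 << 20, "position_embd.weight": 1 << 18},
		"mm": {"0.weight": 1 << 16},
	}

	// Same shape as the new projectorMemoryRequirements loop: one pass,
	// summing each layer's size.
	var mem uint64
	for _, layer := range layers {
		mem += layer.size()
	}
	fmt.Println(mem, "bytes") // 1376256 bytes
}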