partial offloading

Michael Yang 2024-04-05 14:50:38 -07:00
parent 8b2c10061c
commit 7e33a017c0
6 changed files with 98 additions and 86 deletions


@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
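Note: HumanBytes2 switches to an unsigned byte count so it can format the uint64 memory figures logged later in this commit. The hunk only shows the MiB branch; as a rough sketch, the full uint64 variant plausibly looks like the following (the KiB and byte branches and the KibiByte constant are assumptions, not part of this diff):

func HumanBytes2(b uint64) string {
	switch {
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	case b >= KibiByte: // assumed branch, mirroring the MiB case above
		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}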


@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
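Note: CheckVRAM now reports free VRAM as uint64 on every platform, so callers can feed it straight into the unsigned memory accounting below without casts. A minimal, hypothetical caller sketch (assumes this repo's gpu and format packages plus os, strconv, and log/slog are imported; the 8 GiB figure is purely illustrative):

// OLLAMA_MAX_VRAM overrides the detected value; it is parsed as a byte count, as in the hunk above.
os.Setenv("OLLAMA_MAX_VRAM", strconv.FormatUint(8<<30, 10)) // pretend only 8 GiB is free
if memoryAvailable, err := gpu.CheckVRAM(); err == nil {
	slog.Info("VRAM budget", "available", format.HumanBytes2(memoryAvailable))
}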


@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {


@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }


@@ -324,45 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-	layers := llm.Tensors().Layers()
-
-	var attnQKVWeight1 uint64 = 0
-	if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
-		attnQKVWeight1 = t.Shape[1]
-	}
-
-	var ffnGate0Weight1 uint64 = 0
-	if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
-		ffnGate0Weight1 = t.Shape[1]
-	}
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate0Weight1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
-		}
-
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
-	}
-
-	return 0, false
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
+	}
+
+	return
 }
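Note: GraphSize now returns two scratch-memory estimates instead of a single value: partialOffload (compute-graph overhead when only some layers sit on the GPU) and fullOffload (overhead when everything is offloaded). Layer weights and the KV cache are still accounted for separately in the server changes below. A worked example of the llama branch, using assumed 7B-class hyperparameters (embedding=4096, heads=32, headsKV=32, vocab=32000, context=2048, batch=512; illustrative numbers only, needs Go 1.21+ for the built-in max):

package main

import "fmt"

func main() {
	const (
		embedding uint64 = 4096
		heads     uint64 = 32
		headsKV   uint64 = 32
		vocab     uint64 = 32000
		context   uint64 = 2048
		batch     uint64 = 512
	)

	// Same expressions as the "llama" case in GraphSize above.
	fullOffload := 4 * batch * (1 + 4*embedding + context*(1+heads))

	partialOffload := 4 * batch * embedding
	partialOffload += max(
		4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
		4*batch*(embedding+vocab)+embedding*vocab*105/128,
	)

	fmt.Printf("fullOffload    ~%d MiB\n", fullOffload>>20)    // ~164 MiB with these inputs
	fmt.Printf("partialOffload ~%d MiB\n", partialOffload>>20) // ~193 MiB with these inputs
}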


@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
@@ -65,12 +61,12 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -79,55 +75,65 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()*kv) / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
+	}
+
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
-	}
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
-	requiredMemory := usedMemory
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
+		}
+	}
 
-	tensorLayers := ggml.Tensors().Layers()
+	var layerCount int
+	layers := ggml.Tensors().Layers()
 
-	var layers int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
-		requiredMemory += layerMemory
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
 
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
+
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
 		}
 	}
 
-	memOutputLayer := int64(tensorLayers["output"].size())
-	requiredMemory += memOutputLayer
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
 
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info(
 		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(int64(kv)),
-		"graph", format.HumanBytes2(graph),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
+		"kv", format.HumanBytes2(kv),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
@@ -284,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -301,7 +307,7 @@ func projectorMemoryRequirements(filename string) int64 {
 		mem += layer.size()
 	}
 
-	return int64(mem)
+	return mem
 }
 
 type ServerStatus int
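Note: the kv value above is the full fp16 key/value cache for the model; its per-layer share (kv / BlockCount) is added to each repeating layer's tensor size when deciding how many layers fit, and the output layer only counts toward NumGPU once the full total fits in memoryAvailable. A worked example of that formula with the same assumed 7B-class shape (32 layers, 4096 embedding, 32 heads, 32 KV heads, NumCtx=2048; illustrative numbers only):

package main

import "fmt"

func main() {
	const (
		numCtx     uint64 = 2048
		blockCount uint64 = 32
		embedding  uint64 = 4096
		heads      uint64 = 32
		headsKV    uint64 = 32
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embedding / heads * headsKV
	fmt.Printf("kv cache total: %d MiB\n", kv>>20)              // 1024 MiB (1 GiB) with these inputs
	fmt.Printf("kv per layer:   %d MiB\n", (kv/blockCount)>>20) // 32 MiB added on top of each layer's weights
}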