diff --git a/llm/llm.go b/llm/llm.go
index 940c0d93..f7ed09e2 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -2,7 +2,6 @@ package llm
 
 import (
 	"context"
-	"fmt"
 	"log"
 	"os"
 	"runtime"
@@ -41,94 +40,76 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}
 
-	fmt.Println("size", ggml.Size)
-	fmt.Println("filetype", ggml.FileType())
-	fmt.Println("architecture", ggml.ModelFamily())
-	fmt.Println("type", ggml.ModelType())
-	fmt.Println("name", ggml.Name())
-	fmt.Println("embd", ggml.NumEmbed())
-	fmt.Println("head", ggml.NumHead())
-	fmt.Println("head_kv", ggml.NumHeadKv())
-	fmt.Println("gqa", ggml.NumGQA())
-
-	available, _ := gpu.CheckVRAM()
-
-	// For now assume filesize = model size
-	// TODO: use actual model size
-	requiredModel := ggml.Size
+	vram, _ := gpu.CheckVRAM()
+	size := ggml.Size
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calcluations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
-
-	requiredTotal := requiredModel + requiredKv + requiredAlloc
-
-	log.Println("system memory bytes:", available)
-	log.Println("required model bytes:", requiredModel)
-	log.Println("required kv bytes:", requiredKv)
-	log.Println("required alloc bytes:", requiredAlloc)
-	log.Println("required total bytes:", requiredTotal)
+	graph := int64(ggml.NumGQA()) * kv / 6
 
 	info := gpu.GetGPUInfo()
 	library := info.Library
-
-	if opts.NumGPU == -1 {
-		// default to offloading all layers
-		opts.NumGPU = int(ggml.NumLayers()) + 1
-	}
-
-	// decide how many layers to put on the GPU
-	if opts.NumGPU > 0 {
-		switch runtime.GOOS {
-		case "darwin":
-			if requiredTotal > available {
-				log.Println("not enough vram available, falling back to CPU only")
-				opts.NumGPU = 0
-			}
-		default:
-			if library == "cpu" || library == "default" {
-				opts.NumGPU = 0
-				break
-			}
-
-			// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
-			// TODO: find the largest GPU and only reserve memory there
-			avgAvailable := available / int64(info.DeviceCount)
-			if requiredAlloc > avgAvailable {
-				log.Printf("not enough vram available, falling back to CPU only")
-				library = "cpu"
-				opts.NumGPU = 0
-				break
-			}
-
-			// we don't know which GPU will be used, so estimate
-			// the scratch buffer space on all of them
-			// TODO: allocate less layers to the GPU with the scratch buffer
-			// and more to the others (based on their available memory)
-			available -= requiredAlloc * int64(info.DeviceCount)
-
-			// no offloading required
-			if requiredModel+requiredKv <= available {
-				break
-			}
-
-			// fill remaining vram with layers
-			log.Println("splitting", available, "of available memory bytes into layers")
-			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
-			log.Println("bytes per layer:", bytesPerLayer)
-			layers := available / bytesPerLayer
-			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
-			if layers < int64(opts.NumGPU) {
-				opts.NumGPU = int(layers)
-			}
+	switch runtime.GOOS {
+	case "darwin":
+		if opts.NumGPU == 0 {
+			break
 		}
+
+		if size+kv+graph > vram {
+			log.Println("not enough vram available, falling back to CPU only")
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = 1
+	default:
+		if library == "cpu" || library == "default" {
+			log.Println("GPU not available, falling back to CPU")
+			opts.NumGPU = 0
+			break
+		}
+
+		// don't use GPU at all if no layers are loaded
+		if opts.NumGPU == 0 {
+			library = "cpu"
+			break
+		}
+
+		// user-defined GPU count
+		if opts.NumGPU != -1 {
+			break
+		}
+
+		// the "main" GPU needs the most memory and determines the limit
+		// of how many layers can be loaded. It needs to fit:
+		// 1. the full compute graph allocation for all devices (graph)
+		// 2. the proportional kv cache for all devices (kv * % layers)
+		// 3. the proportional model (size * % layers / # devices)
+		// This estimates the number of layers
+		maxlayers := int64(ggml.NumLayers()) + 1
+		devices := int64(info.DeviceCount)
+		avg := vram / devices
+		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		if layers > maxlayers {
+			layers = maxlayers
+		}
+
+		// 1 + 2 must fit on the main gpu
+		min := graph + kv*layers/maxlayers
+		if layers <= 0 || min > avg {
+			log.Printf("not enough vram available, falling back to CPU only")
+			library = "cpu"
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = int(layers)
 	}
 
-	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	return newLlmServer(library, model, adapters, projectors, opts)
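
For a rough sense of what the new arithmetic produces, here is a standalone sketch using hypothetical numbers (a 7B-class fp16 model on a single 24 GB GPU; the size, context length, and head counts are illustrative assumptions, not values from this patch). It only mirrors the kv, graph, and layer-split formulas introduced above.

```go
package main

import "fmt"

func main() {
	// Hypothetical inputs (not taken from the patch): a 7B-class fp16 model.
	var (
		size    int64 = 13_000_000_000 // model file size in bytes
		numCtx  int64 = 2048           // context length
		layers  int64 = 32             // n_layer
		embd    int64 = 4096           // n_embd
		head    int64 = 32             // n_head
		headKV  int64 = 32             // n_head_kv (no GQA)
		gqa     int64 = head / headKV  // num_gqa
		devices int64 = 1              // GPU count
		vram    int64 = 24_000_000_000 // total VRAM in bytes
	)

	// fp16 k,v matrices: 2 bytes each * 2 (key and value)
	kv := 2 * 2 * numCtx * layers * embd * headKV / head

	// overhead + tensors estimate: 1/6 * kv_cache_size * num_gqa
	graph := gqa * kv / 6

	// layer split across devices, mirroring the patch's arithmetic
	maxlayers := layers + 1
	avg := vram / devices
	offload := maxlayers * (avg - graph) / (kv + size/devices)
	if offload > maxlayers {
		offload = maxlayers
	}

	fmt.Println("kv bytes:", kv)
	fmt.Println("graph bytes:", graph)
	fmt.Println("layers to offload:", offload)
}
```

With these numbers the kv cache comes to roughly 1 GiB and the graph estimate to roughly 180 MB, so the split clamps to maxlayers and the whole model is offloaded; shrinking vram toward size+kv+graph drives the layer count down instead, which is the behavior the new default branch relies on.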