diff --git a/format/bytes.go b/format/bytes.go index 01be17ce..a0b8008d 100644 --- a/format/bytes.go +++ b/format/bytes.go @@ -6,11 +6,15 @@ import ( ) const ( - Byte = 1 + Byte = 1 + KiloByte = Byte * 1000 MegaByte = KiloByte * 1000 GigaByte = MegaByte * 1000 TeraByte = GigaByte * 1000 + + KibiByte = Byte * 1024 + MebiByte = KibiByte * 1024 ) func HumanBytes(b int64) string { @@ -45,3 +49,14 @@ func HumanBytes(b int64) string { return fmt.Sprintf("%d %s", int(value), unit) } } + +func HumanBytes2(b int64) string { + switch { + case b >= MebiByte: + return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte) + case b >= KibiByte: + return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte) + default: + return fmt.Sprintf("%d B", b) + } +} diff --git a/gpu/gpu.go b/gpu/gpu.go index d09e94e4..dec3f95e 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -20,6 +20,8 @@ import ( "strings" "sync" "unsafe" + + "github.com/ollama/ollama/format" ) type handles struct { @@ -27,6 +29,11 @@ type handles struct { cudart *C.cudart_handle_t } +const ( + cudaMinimumMemory = 377 * format.MebiByte + rocmMinimumMemory = 377 * format.MebiByte +) + var gpuMutex sync.Mutex var gpuHandles *handles = nil @@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo { } else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) { slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)) resp.Library = "cuda" + resp.MinimumMemory = cudaMinimumMemory } else { slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) } @@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo { } else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) { slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)) resp.Library = "cuda" + resp.MinimumMemory = cudaMinimumMemory } else { slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) } @@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo { } else { AMDGetGPUInfo(&resp) if resp.Library != "" { + resp.MinimumMemory = rocmMinimumMemory return resp } } @@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) { } gpuInfo := GetGPUInfo() if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { - // leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead - overhead := gpuInfo.FreeMemory / 10 - gpus := uint64(gpuInfo.DeviceCount) - if overhead < gpus*1024*1024*1024 { - overhead = gpus * 1024 * 1024 * 1024 - } - // Assigning full reported free memory for Tegras due to OS controlled caching. - if CudaTegra != "" { - // Setting overhead for non-Tegra devices - overhead = 0 - } - avail := int64(gpuInfo.FreeMemory - overhead) - slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024)) - return avail, nil + return int64(gpuInfo.FreeMemory), nil } return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation diff --git a/gpu/types.go b/gpu/types.go index 67727180..6191e196 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -14,6 +14,9 @@ type GpuInfo struct { // Optional variant to select (e.g. versions, cpu feature flags) Variant string `json:"variant,omitempty"` + // MinimumMemory represents the minimum memory required to use the GPU + MinimumMemory int64 `json:"-"` + // TODO add other useful attributes about the card here for discovery information } diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go index 6e43333e..7bd2067d 100644 --- a/llm/dyn_ext_server.go +++ b/llm/dyn_ext_server.go @@ -39,7 +39,7 @@ import ( type dynExtServer struct { s C.struct_dynamic_llama_server - options api.Options + options *api.Options } // Note: current implementation does not support concurrent instantiations @@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error { return fmt.Errorf(C.GoString(resp.msg)) } -func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) { +func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) { if !mutex.TryLock() { slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() diff --git a/llm/ggml.go b/llm/ggml.go index 98a42298..4b73f510 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "strings" ) type GGML struct { @@ -12,6 +13,16 @@ type GGML struct { model } +func (ggml *GGML) LayerSize(prefix string) (n int64) { + for _, t := range ggml.Tensors() { + if strings.HasPrefix(t.Name, prefix) { + n += int64(t.size()) + } + } + + return +} + const ( fileTypeF32 uint32 = iota fileTypeF16 diff --git a/llm/llm.go b/llm/llm.go index 044e9842..c0d2c6d3 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -5,10 +5,11 @@ import ( "fmt" "log/slog" "os" - "runtime" "slices" + "strings" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" ) @@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{ "mamba", } -func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) { +func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er } defer f.Close() - ggml, size, err := DecodeGGML(f) + ggml, _, err := DecodeGGML(f) if err != nil { return nil, err } @@ -49,92 +50,101 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er opts.NumCtx = 4 } - vram, _ := gpu.CheckVRAM() + availableMemory, _ := gpu.CheckVRAM() + info := gpu.GetGPUInfo() - // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value - kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount()) + usedMemory := info.MinimumMemory + for _, projector := range projectors { + usedMemory += projectorMemoryRequirements(projector) + + // multimodal models require at least 2048 context + opts.NumCtx = max(opts.NumCtx, 2048) + } + + // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv + kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV()) // this amount is the overhead + tensors in memory // TODO: get this from the llama.cpp's graph calculations instead of // estimating it's 1/6 * kv_cache_size * num_gqa graph := int64(ggml.KV().GQA()) * kv / 6 + usedMemory += graph - if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) { - opts.NumGPU = 0 + if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) { + info.Library = "cpu" } - info := gpu.GetGPUInfo() - switch runtime.GOOS { - case "darwin": - if opts.NumGPU == 0 { - break - } + requiredMemory := usedMemory - if size+kv+graph > vram { - slog.Info("not enough vram available, setting num_gpu=0") - opts.NumGPU = 0 - break - } + var layers int + for i := 0; i < int(ggml.KV().BlockCount()); i++ { + layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount()) + requiredMemory += layerMemory - // TODO: implement layer splitting on macOS - opts.NumGPU = 999 - default: - if info.Library == "cpu" { - slog.Info("GPU not available, falling back to CPU") - opts.NumGPU = 0 - break + if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) { + usedMemory += layerMemory + layers++ } - - // don't use GPU at all if no layers are loaded - if opts.NumGPU == 0 { - info.Library = "cpu" - info.Variant = gpu.GetCPUVariant() - break - } - - // user-defined GPU count - if opts.NumGPU != -1 { - break - } - - // the "main" GPU needs the most memory and determines the limit - // of how many layers can be loaded. It needs to fit: - // 1. the full compute graph allocation for all devices (graph) - // 2. the proportional kv cache for all devices (kv * % layers) - // 3. the proportional model (size * % layers / # devices) - // This estimates the number of layers - maxlayers := int64(ggml.KV().BlockCount()) + 1 - devices := int64(info.DeviceCount) - avg := vram / devices - layers := maxlayers * (avg - graph) / (kv + size/devices) - if layers > maxlayers { - layers = maxlayers - } - - // 1 + 2 must fit on the main gpu - min := graph + kv*layers/maxlayers - if layers <= 0 || min > avg { - slog.Info("not enough vram available, falling back to CPU only") - info.Library = "cpu" - info.Variant = gpu.GetCPUVariant() - opts.NumGPU = 0 - break - } - - opts.NumGPU = int(layers) } - opts.RopeFrequencyBase = 0.0 - opts.RopeFrequencyScale = 0.0 + memOutputLayer := ggml.LayerSize("output.") + requiredMemory += memOutputLayer + + // only offload output layer if all repeating layers are offloaded + if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer { + usedMemory += memOutputLayer + layers++ + } + + slog.Info( + "offload to gpu", + "layers", layers, + "required", format.HumanBytes2(requiredMemory), + "used", format.HumanBytes2(usedMemory), + "available", format.HumanBytes2(availableMemory), + "kv", format.HumanBytes2(kv), + "graph", format.HumanBytes2(graph), + ) + + if opts.NumGPU < 0 && info.Library != "cpu" { + opts.NumGPU = layers + } + return newLlmServer(info, model, adapters, projectors, opts) } +func projectorMemoryRequirements(filename string) int64 { + file, err := os.Open(filename) + if err != nil { + return 0 + } + defer file.Close() + + ggml, _, err := DecodeGGML(file) + if err != nil { + return 0 + } + + prefixes := make(map[string]struct{}) + for _, layer := range ggml.Tensors() { + parts := strings.Split(layer.Name, ".") + prefixes[strings.Join(parts[:2], ".")] = struct{}{} + } + + var ask int64 + for prefix := range prefixes { + ask += ggml.LayerSize(prefix) + } + + return ask +} + // Give any native cgo implementations an opportunity to initialize func Init() error { return nativeInit() } -func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) { +func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) { dynLibs := getDynLibs(gpuInfo) // Check to see if the user has requested a specific library instead of auto-detecting diff --git a/server/routes.go b/server/routes.go index 77eb77ea..62fa86c9 100644 --- a/server/routes.go +++ b/server/routes.go @@ -68,7 +68,7 @@ var loaded struct { var defaultSessionDuration = 5 * time.Minute // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function -func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error { +func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error { needLoad := loaded.runner == nil || // is there a model loaded? loaded.ModelPath != model.ModelPath || // has the base model changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? @@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D loaded.Model = model loaded.runner = llmRunner - loaded.Options = &opts + loaded.Options = opts } loaded.expireAt = time.Now().Add(sessionDuration) @@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) { sessionDuration = req.KeepAlive.Duration } - if err := load(c, model, opts, sessionDuration); err != nil { + if err := load(c, model, &opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) { sessionDuration = req.KeepAlive.Duration } - if err := load(c, model, opts, sessionDuration); err != nil { + if err := load(c, model, &opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } @@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) { sessionDuration = req.KeepAlive.Duration } - if err := load(c, model, opts, sessionDuration); err != nil { + if err := load(c, model, &opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return }