diff --git a/format/bytes.go b/format/bytes.go
index 9fdc8bcf..13d8575e 100644
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
diff --git a/llm/memory.go b/llm/memory.go
index 005a15aa..6890b08c 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
diff --git a/llm/server.go b/llm/server.go
index b23a7749..78106ea0 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
 
 	sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
 
@@ -214,6 +224,11 @@
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@
 		}
 
 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
 		}
 
 		s.cmd.Env = os.Environ()
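For reference, a minimal sketch (not part of the diff) of how a caller might consume the new three-value return from EstimateGPULayers alongside the new GiB case in format.HumanBytes2: estimatedTotal is now populated even on the CPU-only and insufficient-VRAM paths, and multi-GiB sizes no longer print as thousands of MiB. The reportEstimate helper and the literal values are hypothetical; the import path assumes the github.com/ollama/ollama module.

package main

import (
	"fmt"

	"github.com/ollama/ollama/format"
)

// reportEstimate is a hypothetical helper: layers and estimatedVRAM describe
// what can be offloaded to GPU, while estimatedTotal is the full model size,
// which the changed EstimateGPULayers now returns in every case.
func reportEstimate(layers int, estimatedVRAM, estimatedTotal uint64) {
	if layers == 0 {
		// CPU-only or VRAM-starved path: the total size is still reported.
		fmt.Printf("running on CPU, model size %s\n", format.HumanBytes2(estimatedTotal))
		return
	}
	fmt.Printf("offloading %d layers, %s VRAM of %s total\n",
		layers, format.HumanBytes2(estimatedVRAM), format.HumanBytes2(estimatedTotal))
}

func main() {
	// Example values only; in llm/server.go these come from EstimateGPULayers.
	reportEstimate(33, 5<<30, 8<<30)
}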