diff --git a/gpu/gpu.go b/gpu/gpu.go
index 21666c8d..f8bae9b0 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -31,8 +31,8 @@ type handles struct {
 }
 
 const (
-	cudaMinimumMemory = 457 * format.MebiByte
-	rocmMinimumMemory = 457 * format.MebiByte
+	cudaMinimumMemory = 256 * format.MebiByte
+	rocmMinimumMemory = 256 * format.MebiByte
 )
 
 var gpuMutex sync.Mutex
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index f8cc1adb..0ba02e1b 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -15,7 +15,7 @@ import (
 )
 
 const (
-	metalMinimumMemory = 512 * format.MebiByte
+	metalMinimumMemory = 384 * format.MebiByte
 )
 
 func GetGPUInfo() GpuInfoList {
diff --git a/llm/memory.go b/llm/memory.go
index 661a0c50..005a15aa 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
+	layers := ggml.Tensors().Layers()
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
+	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
 	if memoryRequiredPartial > memoryAvailable {
 		slog.Debug("insufficient VRAM to load any model layers")
 		return 0, 0
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
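
For reviewers, a minimal standalone sketch of the arithmetic this patch changes. The MiB figures below are made-up placeholders; only the shape of the formula (minimum overhead + compute graph + size of the "blk.0" tensors) comes from the diff, and the real estimator reads these values from the GGUF tensor metadata and the detected GPU.

package main

import "fmt"

const mebiByte = 1024 * 1024 // stand-in for format.MebiByte

func main() {
	// Hypothetical example values, not measurements from any real model.
	var (
		memoryMinimum       uint64 = 256 * mebiByte  // new cuda/rocm floor from gpu/gpu.go
		graphFullOffload    uint64 = 900 * mebiByte  // graph buffer, all layers on GPU
		graphPartialOffload uint64 = 700 * mebiByte  // graph buffer, some layers on GPU
		blk0Size            uint64 = 120 * mebiByte  // layers["blk.0"].size()
		memoryAvailable     uint64 = 2048 * mebiByte // detected free VRAM
	)

	// After the patch, both estimates reserve room for one full layer of
	// weights (blk.0) up front, so the "can we offload anything at all?"
	// check no longer passes on graph overhead alone.
	memoryRequiredTotal := memoryMinimum + graphFullOffload + blk0Size
	memoryRequiredPartial := memoryMinimum + graphPartialOffload + blk0Size

	if memoryRequiredPartial > memoryAvailable {
		fmt.Println("insufficient VRAM to load any model layers")
		return
	}
	fmt.Printf("partial: %d MiB, total: %d MiB\n",
		memoryRequiredPartial/mebiByte, memoryRequiredTotal/mebiByte)
}

Note that hoisting layers := ggml.Tensors().Layers() above the early return is what makes the blk.0 term available to both estimates; previously the lookup only happened after the VRAM check had already passed.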