diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 5c45165f..97c8274f 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -27,7 +27,7 @@ const (
     GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
     // Direct Rendering Manager sysfs location
-    DRMDeviceDirGlob   = "/sys/class/drm/card[0-9]/device"
+    DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
     DRMTotalMemoryFile = "mem_info_vram_total"
     DRMUsedMemoryFile  = "mem_info_vram_used"
 
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 5dcab592..1832667b 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
         return GpuInfoList{cpus[0].GpuInfo}
     }
 
-    // TODO - implement
-
-    // TODO refine the discovery to only gather total memory
-
     // On windows we bundle the nvidia library one level above the runner dir
     depPath := ""
     if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
diff --git a/gpu/types.go b/gpu/types.go
index c712af90..a633e6c7 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -44,14 +44,14 @@ type CPUInfo struct {
 type CudaGPUInfo struct {
     GpuInfo
-    index int // device index
+    index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
     GpuInfo
-    usedFilepath string // linux
-    index        int    // device index on windows
+    usedFilepath string // nolint: unused
+    index        int    // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index 1f341fc3..e4bc872c 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
         }
         resp = [2][]string{
             []string{"sunlight"},
-            []string{"england", "english", "massachusetts", "pilgrims"},
+            []string{"england", "english", "massachusetts", "pilgrims", "british"},
         }
     )
     var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
             }
         }(i)
     }
+    go func() {
+        for {
+            time.Sleep(2 * time.Second)
+            select {
+            case <-ctx.Done():
+                return
+            default:
+                models, err := client.ListRunning(ctx)
+                if err != nil {
+                    slog.Warn("failed to list running models", "error", err)
+                    continue
+                }
+                for _, m := range models.Models {
+                    slog.Info("loaded model snapshot", "model", m)
+                }
+            }
+        }
+    }()
     wg.Wait()
 }
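The DRMDeviceDirGlob change above widens the match from card[0-9] to card*, which matters on hosts that expose more than ten DRM nodes (card10 and up). A minimal sketch of the difference using filepath.Match on hypothetical sysfs paths — the real discovery code globs the filesystem, so the paths below are illustrative only:

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Hypothetical sysfs entries; on a real machine these come from the kernel.
	paths := []string{
		"/sys/class/drm/card0/device",
		"/sys/class/drm/card9/device",
		"/sys/class/drm/card10/device", // eleventh node: card[0-9] never matches this
	}
	for _, p := range paths {
		oldGlob, _ := filepath.Match("/sys/class/drm/card[0-9]/device", p)
		newGlob, _ := filepath.Match("/sys/class/drm/card*/device", p)
		fmt.Printf("%-34s old=%-5v new=%v\n", p, oldGlob, newGlob)
	}
}
```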
diff --git a/integration/context_test.go b/integration/context_test.go
index 08033125..75efb435 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+    ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer timeout needed for small-footprint GPUs
     defer cancel()
     // Set up the test data
     req := api.GenerateRequest{
diff --git a/integration/utils_test.go b/integration/utils_test.go
index c6f19e98..5da6fc72 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
         [][]string{
             []string{"sunlight"},
             []string{"soil", "organic", "earth", "black", "tan"},
-            []string{"england", "english", "massachusetts", "pilgrims"},
+            []string{"england", "english", "massachusetts", "pilgrims", "british"},
             []string{"fourth", "july", "declaration", "independence"},
             []string{"nitrogen", "oxygen", "carbon", "dioxide"},
         }
diff --git a/llm/ggml.go b/llm/ggml.go
index 645447d5..35b89d16 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
         partialOffload = 4 * batch * embedding
         partialOffload += max(
+            // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
             4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
             4*batch*(embedding+vocab)+embedding*vocab*105/128,
         )
diff --git a/llm/memory.go b/llm/memory.go
index 1c2e476b..6f830cb1 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,9 +3,10 @@ package llm
 import (
     "fmt"
     "log/slog"
+    "strconv"
+    "strings"
 
     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     var estimatedVRAM uint64
     for _, gpus := range allGpus.ByLibrary() {
         var layerCount int
-        layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+        estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+        layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
         if opts.NumGPU < 0 {
             if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                 return true, estimatedVRAM
@@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     return false, estimatedVRAM
 }
 
+type MemoryEstimate struct {
+    // How many layers we predict we can load
+    Layers int
+
+    // The size of the graph which occupies the main GPU
+    Graph uint64
+
+    // How much VRAM will be allocated given the number of layers we predict
+    VRAMSize uint64
+
+    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
+    TotalSize uint64
+
+    // For multi-GPU scenarios, this provides the tensor split parameter
+    TensorSplit string
+
+    // For multi-GPU scenarios, this is the size in bytes per GPU
+    GPUSizes []uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-    var memoryAvailable uint64
-    for _, info := range gpus {
-        memoryAvailable += info.FreeMemory
-    }
-    if envconfig.MaxVRAM > 0 {
-        memoryAvailable = envconfig.MaxVRAM
-    }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+    // Graph size for a partial offload, applies to all GPUs
+    var graphPartialOffload uint64
 
-    slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+    // Graph size when all layers are offloaded, applies to all GPUs
+    var graphFullOffload uint64
 
-    // TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-    memoryMinimum := gpus[0].MinimumMemory
+    // Final graph offload once we know full or partial
+    var graphOffload uint64
+
+    // Projectors loaded into GPU0 only
+    var projectorSize uint64
+
+    // Conditional output size on GPU 0
+    var memoryLayerOutput uint64
+    var includeOutput bool
+
+    // One extra layer as a pad for each GPU
+    var layerBuffer uint64
+
+    // The sizes of the main layers
+    var layerSizes []uint64
+
+    // The sum of all the layer sizes (just for logging)
+    var memoryWeights uint64
+
+    // True if all the layers are loaded
+    var fullyLoaded bool
+
+    // Overflow that didn't fit into the GPU
+    var overflow uint64
+
+    availableList := make([]string, len(gpus))
+    for i, gpu := range gpus {
+        availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+    }
+    slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
     for _, projector := range projectors {
-        memoryMinimum += projectorMemoryRequirements(projector)
+        projectorSize += projectorMemoryRequirements(projector)
 
         // multimodal models require at least 2048 context
         opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     layers := ggml.Tensors().Layers()
 
     // add one layer worth of memory as a buffer
     if blk0, ok := layers["blk.0"]; ok {
-        memoryMinimum += blk0.size()
+        layerBuffer = blk0.size()
     }
-
     // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
     var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-    graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+    graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
     if graphPartialOffload == 0 {
         graphPartialOffload = ggml.KV().GQA() * kv / 6
     }
-
     if graphFullOffload == 0 {
         graphFullOffload = graphPartialOffload
     }
-    graphFullOffload *= uint64(len(gpus))
-    graphPartialOffload *= uint64(len(gpus))
-
     // on metal there's no partial offload overhead
     if gpus[0].Library == "metal" {
         graphPartialOffload = graphFullOffload
     }
 
-    // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-    memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-    // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-    memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-    var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.size()
     }
-
     if layer, ok := layers["output"]; ok {
         memoryLayerOutput += layer.size()
     } else if layer, ok := layers["token_embd"]; ok {
@@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     }
 
     if gpus[0].Library == "metal" && opts.UseMMap {
-        // memory is preallocated for output tensors
-        memoryRequiredTotal += memoryLayerOutput
-        memoryRequiredPartial += memoryLayerOutput
+        includeOutput = true
+    } else if gpus[0].Library != "metal" || !opts.UseMMap {
+        includeOutput = true
     }
+
+    gpuZeroOverhead := projectorSize
+    if includeOutput {
+        gpuZeroOverhead += memoryLayerOutput
+    }
+
+    // Reduce the set of GPUs to only those that have sufficient space to fit the overhead and at least one layer
     var layerCount int
+    layerCounts := make([]int, len(gpus))
+    gpuAllocations := make([]uint64, len(gpus))
+    type gs struct {
+        i int
+        g *gpu.GpuInfo
+    }
+    gpusWithSpace := []gs{}
+    for i := range gpus {
+        var gzo uint64
+        if len(gpusWithSpace) == 0 {
+            gzo = gpuZeroOverhead
+        }
+        // Only include GPUs that can fit the graph, the GPU minimum memory, the layer buffer and at least one more layer
+        if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+            slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+            continue
+        }
+        gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+        gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on the graph until we know partial vs. full
+    }
+
+    var gpuZeroID int
+    if len(gpusWithSpace) > 0 {
+        gpuZeroID = gpusWithSpace[0].i
+        gpuAllocations[gpuZeroID] += gpuZeroOverhead
+    }
+
+    layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
     for i := range int(ggml.KV().BlockCount()) {
         if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
             memoryLayer := blk.size()
 
             // KV is proportional to the number of layers
             memoryLayer += kv / ggml.KV().BlockCount()
-
-            memoryRequiredTotal += memoryLayer
-            if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-                memoryRequiredPartial += memoryLayer
-                layerCount++
-            }
+            layerSizes[i] = memoryLayer
+            memoryWeights += memoryLayer
         }
     }
 
-    if gpus[0].Library != "metal" || !opts.UseMMap {
-        // memory was not preallocated for output tensors
-        memoryRequiredTotal += memoryLayerOutput
+    // For all the layers, find where they can fit on the GPU(s)
+    for i := range layerSizes {
+        if layerSizes[i] == 0 {
+            continue
+        }
+        if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+            // Stop allocating on GPU(s) once we hit the user's target NumGPU
+            continue
+        }
+
+        // distribute the layers across the GPU(s) that have space
+        for j := len(gpusWithSpace); j > 0; j-- {
+            g := gpusWithSpace[i%j]
+            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+            if g.g.FreeMemory > used+layerSizes[i] {
+                gpuAllocations[g.i] += layerSizes[i]
+                layerCounts[g.i]++
+                layerCount++
+                break
+            } else {
+                gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
+            }
+        }
+    }
+
+    if layerCount >= int(ggml.KV().BlockCount()) {
+        fullyLoaded = true
+    } else {
+        for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+            overflow += layerSizes[i]
+        }
+    }
+    // Find where the output fits
+    if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+        for j := len(gpusWithSpace); j > 0; j-- {
+            g := gpusWithSpace[layerCount%j]
+            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+            if g.g.FreeMemory > used+memoryLayerOutput {
+                gpuAllocations[g.i] += memoryLayerOutput
+                layerCounts[g.i]++
+                layerCount++
+                break
+            }
+        }
+        if layerCount < int(ggml.KV().BlockCount())+1 {
+            fullyLoaded = false
+            overflow += memoryLayerOutput
+        }
     }
 
-    if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-        layerCount = int(ggml.KV().BlockCount()) + 1
-        memoryRequiredPartial = memoryRequiredTotal
+    // Add the applicable (full or partial) graph allocations
+    for i := range gpus {
+        if layerCounts[i] <= 0 {
+            continue
+        }
+        if fullyLoaded {
+            gpuAllocations[i] += graphFullOffload
+        } else {
+            gpuAllocations[i] += graphPartialOffload
+        }
+    }
+    if fullyLoaded {
+        graphOffload = graphFullOffload
+    } else {
+        graphOffload = graphPartialOffload
     }
 
-    memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+    // Summaries for the log
+    var memoryRequiredPartial, memoryRequiredTotal uint64
+    for i := range gpuAllocations {
+        memoryRequiredPartial += gpuAllocations[i]
+    }
+    memoryRequiredTotal = memoryRequiredPartial + overflow
+
+    tensorSplit := ""
+    if len(gpus) > 1 {
+        splits := make([]string, len(gpus))
+        for i, count := range layerCounts {
+            splits[i] = strconv.Itoa(count)
+        }
+        tensorSplit = strings.Join(splits, ",")
+    }
+    allocationsList := []string{}
+    for _, a := range gpuAllocations {
+        allocationsList = append(allocationsList, format.HumanBytes2(a))
+    }
 
     slog.Info(
         "offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
             "layers",
             // requested number of layers to offload
             "requested", opts.NumGPU,
+            // The number of layers the model has (including output)
+            "model", int(ggml.KV().BlockCount())+1,
             // estimated number of layers that can be offloaded
-            "real", layerCount,
+            "offload", layerCount,
+            // multi-GPU split for tensors
+            "split", tensorSplit,
         ),
         slog.Group(
             "memory",
-            // memory available for offloading
-            "available", format.HumanBytes2(memoryAvailable),
+            // memory available by GPU for offloading
+            "available", availableList,
             slog.Group(
                 "required",
                 // memory required for full offloading
@@ -151,6 +295,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
                 "partial", format.HumanBytes2(memoryRequiredPartial),
                 // memory of KV cache
                 "kv", format.HumanBytes2(kv),
+                // Allocations across the GPUs
+                "allocations", allocationsList,
             ),
             slog.Group(
                 "weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         ),
     )
     if gpus[0].Library == "cpu" {
-        return 0, 0, memoryRequiredTotal
+        return MemoryEstimate{
+            Layers:    0,
+            Graph:     0,
+            VRAMSize:  0,
+            TotalSize: memoryRequiredTotal,
+            GPUSizes:  []uint64{},
+        }
     }
-    if memoryRequiredPartial > memoryAvailable {
+    if layerCount == 0 {
         slog.Debug("insufficient VRAM to load any model layers")
-        return 0, 0, memoryRequiredTotal
+        return MemoryEstimate{
+            Layers:    0,
+            Graph:     0,
+            VRAMSize:  0,
+            TotalSize: memoryRequiredTotal,
+            GPUSizes:  []uint64{},
+        }
     }
-    return layerCount, memoryRequiredPartial, memoryRequiredTotal
+    return MemoryEstimate{
+        Layers:      layerCount,
+        Graph:       graphOffload,
+        VRAMSize:    memoryRequiredPartial,
+        TotalSize:   memoryRequiredTotal,
+        TensorSplit: tensorSplit,
+        GPUSizes:    gpuAllocations,
+    }
 }
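For orientation, here is a hedged sketch of how the new MemoryEstimate return value might be consumed in place of the old (layerCount, VRAM, total) tuple. It only uses identifiers that appear in this patch (LoadModel, EstimateGPULayers, gpu.GetGPUInfo, GpuInfoList.ByLibrary, format.HumanBytes2); the model path and the decision logic are illustrative, not part of the change. As a side note on the KV term above: for a Llama-7B-shaped model (32 blocks, 4096 embedding, 32 heads, 32 KV heads) at 2048 context, kv = 2*2*2048*32*4096/32*32 bytes = 1 GiB.

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

func main() {
	// Hypothetical GGUF path; substitute a real model file.
	ggml, err := llm.LoadModel("/path/to/model.gguf")
	if err != nil {
		log.Fatal(err)
	}

	opts := api.DefaultOptions()
	// EstimateGPULayers expects GPUs of a single library, hence ByLibrary.
	for _, gpus := range gpu.GetGPUInfo().ByLibrary() {
		estimate := llm.EstimateGPULayers(gpus, ggml, nil, opts)
		fmt.Printf("library=%s layers=%d split=%q vram=%s total=%s\n",
			gpus[0].Library,
			estimate.Layers,
			estimate.TensorSplit,
			format.HumanBytes2(estimate.VRAMSize),
			format.HumanBytes2(estimate.TotalSize))
		// Fully offloaded means every block plus the output layer was placed,
		// in which case VRAMSize should equal TotalSize (see the struct comment).
		if estimate.Layers >= int(ggml.KV().BlockCount())+1 {
			fmt.Println("  model fits entirely in VRAM")
		}
	}
}
```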
diff --git a/llm/memory_test.go b/llm/memory_test.go
new file mode 100644
index 00000000..0adbc541
--- /dev/null
+++ b/llm/memory_test.go
@@ -0,0 +1,116 @@
+package llm
+
+import (
+    "bytes"
+    "encoding/binary"
+    "fmt"
+    "os"
+    "testing"
+
+    "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/envconfig"
+    "github.com/ollama/ollama/gpu"
+    "github.com/stretchr/testify/assert"
+    "github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+    envconfig.Debug = true
+    modelName := "dummy"
+    f, err := os.CreateTemp(t.TempDir(), modelName)
+    assert.Nil(t, err)
+    defer f.Close()
+    gguf := NewGGUFV3(binary.LittleEndian)
+    inputLayerCount := 5
+    tensors := []Tensor{
+        {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+    }
+    assert.Equal(t, inputLayerCount+1, len(tensors))
+    err = gguf.Encode(f, KV{
+        "general.architecture":          "llama",
+        "general.name":                  "name",
+        "llama.context_length":          uint32(32),
+        "llama.embedding_length":        uint32(4096),
+        "llama.block_count":             uint32(inputLayerCount),
+        "llama.attention.head_count":    uint32(32),
+        "llama.attention.head_count_kv": uint32(32),
+        "tokenizer.ggml.tokens":         []string{" "},
+        "tokenizer.ggml.scores":         []float32{0},
+        "tokenizer.ggml.token_type":     []int32{0},
+    }, tensors)
+    require.NoError(t, err)
+
+    ggml, err := LoadModel(f.Name())
+    require.NoError(t, err)
+
+    // Simple CPU scenario
+    gpus := []gpu.GpuInfo{
+        {
+            Library: "cpu",
+        },
+    }
+    projectors := []string{}
+    opts := api.DefaultOptions()
+    estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+    assert.Equal(t, 0, estimate.Layers)
+    assert.Equal(t, uint64(0), estimate.Graph)
+
+    // derived from the dummy ggml file above
+    graphPartialOffload := uint64(202377216)
+    graphFullOffload := uint64(171968512)
+    layerSize := uint64(33554436)
+    projectorSize := uint64(0)
+    memoryLayerOutput := uint64(4)
+
+    // Dual CUDA scenario with asymmetry
+    gpuMinimumMemory := uint64(2048)
+    gpus = []gpu.GpuInfo{
+        {
+            Library:       "cuda",
+            MinimumMemory: gpuMinimumMemory,
+        },
+        {
+            Library:       "cuda",
+            MinimumMemory: gpuMinimumMemory,
+        },
+    }
+    // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+    for i, s := range [][]uint64{
+        {1, 1, 1, 1},
+        {2, 1, 2, 1},
+        {2, 2, 2, 2},
+        {1, 2, 1, 2},
+        {3, 3, 3, 3},
+        {4, 4, 3, 3},
+        {6, 6, 3, 3},
+        {0, 3, 0, 3},
+    } {
+        gpus[0].FreeMemory = 0
+        gpus[1].FreeMemory = 0
+        gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+        gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+        gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+        gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+        gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+        estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+        assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
+        assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
+        var layerSums uint64
+        for _, b := range estimate.GPUSizes {
+            layerSums += b
+        }
+        if estimate.Layers < inputLayerCount+1 {
+            assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+            assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+        } else {
+            assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+            assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+        }
+    }
+}
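To make the dual-GPU scenarios above easier to follow: each GPU's FreeMemory is budgeted as its minimum reserve, one layer-sized pad, s[i] additional layers, the graph scratch, plus one byte of slack, and GPU 0 additionally receives the projector and output-tensor bytes. A rough reading of that arithmetic for scenario {2, 1, 2, 1}; the helper below is mine, not part of the test:

```go
package main

import "fmt"

// Constants mirror the values derived from the dummy GGUF in the test above.
const (
	layerSize         = uint64(33554436)  // one block plus its share of KV cache
	graphScratch      = uint64(202377216) // max(partial, full) graph allocation
	gpuMinimumMemory  = uint64(2048)
	memoryLayerOutput = uint64(4)
)

// budget reproduces how the test composes FreeMemory for one GPU.
func budget(extraLayers uint64, isGPUZero bool) uint64 {
	b := gpuMinimumMemory + layerSize /* one-layer pad */ + extraLayers*layerSize + 1 + graphScratch
	if isGPUZero {
		b += memoryLayerOutput // projector (zero here) and output tensors land on GPU 0
	}
	return b
}

func main() {
	// Scenario {2, 1, 2, 1}: room for 2 layers on GPU 0 and 1 on GPU 1, so
	// EstimateGPULayers is expected to report Layers=3 and TensorSplit="2,1".
	fmt.Println("gpu0 free:", budget(2, true), "gpu1 free:", budget(1, false))
}
```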
diff --git a/llm/server.go b/llm/server.go
index 0a815798..eb3d6365 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -49,13 +49,11 @@ type llmServer struct {
     status  *StatusWriter
     options api.Options
 
-    // TODO - this should be broken down by GPU
-    estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-    estimatedTotal uint64 // Total size of model
-    totalLayers    uint64
-    gpuCount       int
-    loadDuration   time.Duration // Record how long it took the model to load
-    loadProgress   float32
+    estimate     MemoryEstimate
+    totalLayers  uint64
+    gpuCount     int
+    loadDuration time.Duration // Record how long it took the model to load
+    loadProgress float32
 
     sem *semaphore.Weighted
 }
@@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
     var err error
     var cpuRunner string
-    var estimatedVRAM uint64
-    var estimatedTotal uint64
+    var estimate MemoryEstimate
     var systemMemory uint64
     gpuCount := len(gpus)
     if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
@@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         cpuRunner = serverForCpu()
         gpuCount = 0
-        _, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+        estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
@@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
                 slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
             }
         }
-        var layers int
-        layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+        estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
         switch {
-        case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+        case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
             // disable partial offloading when model is greater than total system memory as this
             // can lead to locking up the system
             opts.NumGPU = 0
-        case gpus[0].Library != "metal" && layers == 0:
+        case gpus[0].Library != "metal" && estimate.Layers == 0:
             // Don't bother loading into the GPU if no layers can fit
             cpuRunner = serverForCpu()
             gpuCount = 0
-        case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-            opts.NumGPU = layers
+        case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+            opts.NumGPU = estimate.Layers
         }
     }
 
@@ -232,6 +228,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
     params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
+    if estimate.TensorSplit != "" {
+        params = append(params, "--tensor-split", estimate.TensorSplit)
+    }
+
     for i := range len(servers) {
         dir := availableServers[servers[i]]
         if dir == "" {
@@ -299,16 +299,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }
 
     s := &llmServer{
-        port:           port,
-        cmd:            exec.Command(server, finalParams...),
-        status:         NewStatusWriter(os.Stderr),
-        options:        opts,
-        estimatedVRAM:  estimatedVRAM,
-        estimatedTotal: estimatedTotal,
-        sem:            semaphore.NewWeighted(int64(numParallel)),
-        totalLayers:    ggml.KV().BlockCount() + 1,
-        gpuCount:       gpuCount,
-        done:           make(chan error, 1),
+        port:        port,
+        cmd:         exec.Command(server, finalParams...),
+        status:      NewStatusWriter(os.Stderr),
+        options:     opts,
+        estimate:    estimate,
+        sem:         semaphore.NewWeighted(int64(numParallel)),
+        totalLayers: ggml.KV().BlockCount() + 1,
+        gpuCount:    gpuCount,
+        done:        make(chan error, 1),
     }
 
     s.cmd.Env = os.Environ()
@@ -1004,11 +1003,11 @@ func (s *llmServer) Close() error {
 }
 
 func (s *llmServer) EstimatedVRAM() uint64 {
-    return s.estimatedVRAM
+    return s.estimate.VRAMSize
 }
 
 func (s *llmServer) EstimatedTotal() uint64 {
-    return s.estimatedTotal
+    return s.estimate.TotalSize
 }
 
 func parseDurationMs(ms float64) time.Duration {
diff --git a/server/sched_test.go b/server/sched_test.go
index f7dce6d1..e7ea5874 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
         "tokenizer.ggml.token_type": []int32{0},
     }, []llm.Tensor{
         {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+        {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
     })
     require.NoError(t, err)
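Taken together, the placement loop in EstimateGPULayers and the new --tensor-split argument work like this: layers are dealt round-robin across the GPUs that still have room, a GPU drops out of the rotation once the next layer no longer fits, and the resulting per-GPU counts become the comma-separated TensorSplit string handed to the runner. A stripped-down sketch of just that rotation (bookkeeping for the graph, projector, and output layer omitted; the function and variable names here are illustrative, not the patch's):

```go
package main

import "fmt"

// distribute deals nLayers of size layerSize across GPUs with the given free
// capacities, mirroring the rotation used in EstimateGPULayers.
func distribute(free []uint64, layerSize uint64, nLayers int) []int {
	counts := make([]int, len(free))
	active := make([]int, 0, len(free)) // indexes of GPUs still accepting layers
	for i := range free {
		active = append(active, i)
	}
	for i := 0; i < nLayers && len(active) > 0; i++ {
		for j := len(active); j > 0; j-- {
			g := active[i%j]
			if free[g] >= layerSize {
				free[g] -= layerSize
				counts[g]++
				break
			}
			// this GPU is full; drop it from the rotation and retry
			active = append(active[:i%j], active[i%j+1:]...)
		}
	}
	return counts
}

func main() {
	fmt.Println(distribute([]uint64{3, 2}, 1, 10)) // prints [3 2]
}
```

With free capacities of 3 and 2 layer-sized units, the counts come out as [3 2], which EstimateGPULayers would render as TensorSplit "3,2" and NewLlamaServer would pass to the runner as --tensor-split 3,2.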