ollama/llm/memory.go

package llm

import (
	"fmt"
	"log/slog"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
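// It reports whether some same-library group of GPUs can hold the model (all layers
// when opts.NumGPU < 0, otherwise at least opts.NumGPU layers), along with the
// estimated VRAM for that group.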
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
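// It returns the number of layers that can be offloaded, the estimated VRAM for that
// partial load, and the memory required to load the model fully.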
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
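	// Budget: the free memory summed across every GPU in this group; an explicit
	// envconfig.MaxVRAM, when set, overrides the detected amount.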
	var memoryAvailable uint64
	for _, info := range gpus {
		memoryAvailable += info.FreeMemory
	}
	if envconfig.MaxVRAM > 0 {
		memoryAvailable = envconfig.MaxVRAM
	}

	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))

	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
	memoryMinimum := gpus[0].MinimumMemory

	for _, projector := range projectors {
		memoryMinimum += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	layers := ggml.Tensors().Layers()

	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		memoryMinimum += blk0.size()
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
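	// Illustrative sanity check (hypothetical dimensions, not read from any model):
	// n_ctx=2048, n_layer=32, n_embd=4096, n_head=32, n_head_kv=8 gives
	// 2*2*2048*32*4096/32*8 = 268435456 bytes, i.e. 256 MiB of fp16 KV cache.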

	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

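	// This estimate assumes the graph buffer is allocated on every GPU in the
	// group when the model is split across devices, so scale by the GPU count.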
	graphFullOffload *= uint64(len(gpus))
	graphPartialOffload *= uint64(len(gpus))

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	}

	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
	memoryRequiredTotal := memoryMinimum + graphFullOffload

	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
	memoryRequiredPartial := memoryMinimum + graphPartialOffload

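	// memoryLayerOutput accumulates the non-repeating tensors: the final norm plus
	// either the output projection or, when the weights are tied, the token embedding.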
	var memoryLayerOutput uint64
	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}

	if gpus[0].Library == "metal" && opts.UseMMap {
		// memory is preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
		memoryRequiredPartial += memoryLayerOutput
	}

	var layerCount int
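	// Walk the repeating blocks in order: each block contributes its weights plus an
	// equal share of the KV cache. A block counts toward the partial estimate only
	// while the user-requested NumGPU has not been reached (or, when NumGPU < 0,
	// while it still fits in the remaining available memory).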
	for i := range int(ggml.KV().BlockCount()) {
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			memoryLayer := blk.size()

			// KV is proportional to the number of layers
			memoryLayer += kv / ggml.KV().BlockCount()

			memoryRequiredTotal += memoryLayer
			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
				memoryRequiredPartial += memoryLayer
				layerCount++
			}
		}
	}

	if gpus[0].Library != "metal" || !opts.UseMMap {
		// memory was not preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
	}

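	// If the whole model fits in the available memory (or the request asked for more
	// layers than the loop counted), report a full offload: all blocks plus the output layer.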
	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
		layerCount = int(ggml.KV().BlockCount()) + 1
		memoryRequiredPartial = memoryRequiredTotal
	}

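	// memoryWeights approximates the memory taken by the weights alone: the
	// full-offload total minus the fixed minimum, the graph buffer, and the KV cache.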
	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", opts.NumGPU,
			// estimated number of layers that can be offloaded
			"real", layerCount,
		),
		slog.Group(
			"memory",
			// memory available for offloading
			"available", format.HumanBytes2(memoryAvailable),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)

	if gpus[0].Library == "cpu" {
		return 0, 0, memoryRequiredTotal
	}
	if memoryRequiredPartial > memoryAvailable {
		slog.Debug("insufficient VRAM to load any model layers")
		return 0, 0, memoryRequiredTotal
	}

	return layerCount, memoryRequiredPartial, memoryRequiredTotal
}
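
// Illustrative usage only (a sketch, not code from this package): a caller that
// already has the detected GPUs (allGpus gpu.GpuInfoList), the parsed model (ggml *GGML),
// and the request options (opts api.Options) might combine the two functions above
// roughly as follows:
//
//	if fits, vram := PredictServerFit(allGpus, ggml, nil, nil, opts); fits {
//		slog.Debug("model fits", "vram", format.HumanBytes2(vram))
//	} else {
//		// fall back to a partial offload on the first same-library GPU group
//		layers, partial, total := EstimateGPULayers(allGpus.ByLibrary()[0], ggml, nil, opts)
//		slog.Debug("partial offload", "layers", layers, "vram", format.HumanBytes2(partial), "full", format.HumanBytes2(total))
//	}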