ollama/discover/types.go

package discover

import (
	"fmt"
	"log/slog"

	"github.com/ollama/ollama/format"
)

// memInfo groups the memory counters shared by the device descriptions in
// this file; it is embedded in GpuInfo. Each field is dropped from the JSON
// encoding when it is zero (omitempty).
// NOTE(review): units are presumably bytes, since LogDetails renders
// TotalMemory and FreeMemory with format.HumanBytes2 — confirm.
type memInfo struct {
	TotalMemory uint64 `json:"total_memory,omitempty"`
	FreeMemory  uint64 `json:"free_memory,omitempty"`
	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
}

// Beginning of an `ollama info` command
//
// GpuInfo describes a single inference device. It embeds memInfo for the
// memory counters and is itself embedded by CPUInfo, CudaGPUInfo, RocmGPUInfo,
// OneapiGPUInfo and UnsupportedGPUInfo, so the same fields describe both the
// system CPU and the individual GPUs.
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
	memInfo
	Library string `json:"library,omitempty"`

	// Optional variant to select (e.g. versions, cpu feature flags)
	// GpuInfoList.ByLibrary combines this with Library to form its grouping key.
	Variant string `json:"variant"`

	// MinimumMemory represents the minimum memory required to use the GPU
	// It is never written to JSON (tag "-").
	MinimumMemory uint64 `json:"-"`

	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
	// Serialized under the JSON key "lib_path".
	DependencyPath []string `json:"lib_path,omitempty"`

	// Extra environment variables specific to the GPU as list of [key,value]
	// Serialized under the JSON key "envs".
	EnvWorkarounds [][2]string `json:"envs,omitempty"`

	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
	// the FreeMemory is best effort, and may over or under report actual memory usage
	// False indicates FreeMemory can generally be trusted on this GPU
	// NOTE: this field has no json tag, so encoding/json emits it under its Go
	// name "UnreliableFreeMemory", unlike the snake_case keys of its siblings.
	UnreliableFreeMemory bool

	// GPU information
	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
	Name    string `json:"name"`    // user friendly name if available
	Compute string `json:"compute"` // Compute Capability or gfx

	// Driver Information - TODO no need to put this on each GPU
	// LogDetails reports these as "<DriverMajor>.<DriverMinor>".
	DriverMajor int `json:"driver_major,omitempty"`
	DriverMinor int `json:"driver_minor,omitempty"`

	// TODO other performance capability info to help in scheduling decisions
}

// CPUInfo describes the host CPU as an inference device: the embedded GpuInfo
// carries the generic device fields and CPUs lists one entry per CPU package.
// CPUs has no json tag, so encoding/json emits it under the key "CPUs".
type CPUInfo struct {
	GpuInfo
	CPUs []CPU
}

// CPU type represents a CPU Package occupying a socket
//
// ID, VendorID and ModelName carry `cpuinfo` struct tags naming the keys they
// are associated with.
// NOTE(review): the tag values look like /proc/cpuinfo field names consumed by
// a parser outside this file — confirm.
//
// EfficiencyCoreCount is counted within CoreCount; the performance core count
// is CoreCount - EfficiencyCoreCount, which is the quantity
// SystemInfo.GetOptimalThreadCount sums across packages.
type CPU struct {
	ID                  string `cpuinfo:"processor"`
	VendorID            string `cpuinfo:"vendor_id"`
	ModelName           string `cpuinfo:"model name"`
	CoreCount           int
	EfficiencyCoreCount int // Performance = CoreCount - Efficiency
	ThreadCount         int
}

// CudaGPUInfo is a GpuInfo extended with CUDA-specific bookkeeping fields.
// The unexported fields carry nolint:unused markers.
// NOTE(review): presumably they are only referenced from platform-specific
// files outside this one — confirm.
type CudaGPUInfo struct {
	GpuInfo
	OSOverhead   uint64 // Memory overhead between the driver library and management library
	index        int    //nolint:unused,nolintlint
	computeMajor int    //nolint:unused,nolintlint
	computeMinor int    //nolint:unused,nolintlint
}

// CudaGPUInfoList is a slice of CudaGPUInfo.
type CudaGPUInfoList []CudaGPUInfo

// RocmGPUInfo is a GpuInfo extended with ROCm-specific bookkeeping fields.
// The unexported fields carry nolint:unused markers.
// NOTE(review): presumably they are only referenced from platform-specific
// files outside this one — confirm.
type RocmGPUInfo struct {
	GpuInfo
	usedFilepath string //nolint:unused,nolintlint
	index        int    //nolint:unused,nolintlint
}

// RocmGPUInfoList is a slice of RocmGPUInfo.
type RocmGPUInfoList []RocmGPUInfo

// OneapiGPUInfo is a GpuInfo extended with oneAPI-specific bookkeeping: a
// driver index and a GPU index.
// The unexported fields carry nolint:unused markers.
// NOTE(review): presumably they are only referenced from platform-specific
// files outside this one — confirm.
type OneapiGPUInfo struct {
	GpuInfo
	driverIndex int //nolint:unused,nolintlint
	gpuIndex    int //nolint:unused,nolintlint
}

// OneapiGPUInfoList is a slice of OneapiGPUInfo.
type OneapiGPUInfoList []OneapiGPUInfo

// GpuInfoList is a slice of GpuInfo. It carries the ByLibrary method, which
// groups entries by library and variant, and the LogDetails method, which logs
// every entry.
type GpuInfoList []GpuInfo

// UnsupportedGPUInfo pairs a GpuInfo with a free-form Reason string,
// serialized under the JSON key "reason".
// NOTE(review): Reason presumably explains why the device was rejected during
// discovery — confirm against the code that populates it.
type UnsupportedGPUInfo struct {
	GpuInfo
	Reason string `json:"reason"`
}

// ByLibrary partitions the list into groups that share the same library and
// variant.
//
// The grouping key is the Library name, with "_"+Variant appended unless the
// Variant equals CPUCapabilityNone.String(). Groups appear in the order their
// key is first seen in l, and entries keep their relative order within a
// group. The result is always non-nil; an empty receiver yields an empty
// slice.
func (l GpuInfoList) ByLibrary() []GpuInfoList {
	groups := []GpuInfoList{}
	// position maps a grouping key to the index of its group in groups.
	position := map[string]int{}

	for _, gpu := range l {
		key := gpu.Library
		if gpu.Variant != CPUCapabilityNone.String() {
			key += "_" + gpu.Variant
		}

		if idx, seen := position[key]; seen {
			groups[idx] = append(groups[idx], gpu)
			continue
		}

		// First entry for this key: open a new group at the end.
		position[key] = len(groups)
		groups = append(groups, GpuInfoList{gpu})
	}

	return groups
}

// LogDetails writes one Info-level "inference compute" log record per entry in
// the list, reporting the device id, library, variant, compute value, driver
// version (formatted as "major.minor"), name, and the total and free memory
// rendered through format.HumanBytes2.
func (l GpuInfoList) LogDetails() {
	for i := range l {
		gpu := l[i]

		driver := fmt.Sprintf("%d.%d", gpu.DriverMajor, gpu.DriverMinor)
		total := format.HumanBytes2(gpu.TotalMemory)
		available := format.HumanBytes2(gpu.FreeMemory)

		slog.Info("inference compute",
			"id", gpu.ID,
			"library", gpu.Library,
			"variant", gpu.Variant,
			"compute", gpu.Compute,
			"driver", driver,
			"name", gpu.Name,
			"total", total,
			"available", available,
		)
	}
}

// ByFreeMemory is a []GpuInfo that provides the Len, Swap and Less methods of
// sort.Interface, ordering entries by ascending FreeMemory.
type ByFreeMemory []GpuInfo

// Len reports the number of entries.
func (a ByFreeMemory) Len() int {
	return len(a)
}

// Swap exchanges the entries at positions i and j.
func (a ByFreeMemory) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}

// Less reports whether entry i has strictly less free memory than entry j.
func (a ByFreeMemory) Less(i, j int) bool {
	left, right := a[i].FreeMemory, a[j].FreeMemory
	return left < right
}

// CPUCapability enumerates the CPU vector-extension levels this package
// distinguishes.
type CPUCapability uint32

// Capability levels in ascending numeric order, starting at zero.
const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
	// TODO AVX512
)

// Override at build time when building base GPU runners
var GPURunnerCPUCapability = CPUCapabilityAVX

// String returns the short name of the capability: "avx" or "avx2" for the
// corresponding constants, and "no vector extensions" for CPUCapabilityNone
// and for any value that is not a declared constant.
func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX2:
		return "avx2"
	case CPUCapabilityAVX:
		return "avx"
	}
	return "no vector extensions"
}

// SystemInfo bundles the discovery results held in this package's types: the
// system CPU description, the list of GPUs, the list of unsupported GPUs with
// their Reason strings, and a list of discovery error strings.
// None of the fields use omitempty, so every key is always present in the
// JSON encoding (nil slices encode as null).
type SystemInfo struct {
	System          CPUInfo              `json:"system"`
	GPUs            []GpuInfo            `json:"gpus"`
	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
	DiscoveryErrors []string             `json:"discovery_errors"`
}

// GetOptimalThreadCount returns the default number of threads to use for
// inference: the sum, over every entry in si.System.CPUs, of CoreCount minus
// EfficiencyCoreCount (i.e. the performance cores of all CPU packages).
// It returns 0 when no CPU entries are present.
func (si SystemInfo) GetOptimalThreadCount() int {
	threads := 0
	// An empty CPU list simply leaves the total at zero.
	for i := range si.System.CPUs {
		pkg := si.System.CPUs[i]
		threads += pkg.CoreCount - pkg.EfficiencyCoreCount
	}
	return threads
}
Rename gpu package discover (#7143) Cleaning up go package naming 2024-10-17 00:45:00 +00:00			`package discover`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX. 2024-05-07 21:54:26 +00:00			`import (`
			`"fmt"`
			`"log/slog"`

			`"github.com/ollama/ollama/format"`
			`)`

Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`type memInfo struct {`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			TotalMemory uint64 `json:"total_memory,omitempty"`
			FreeMemory uint64 `json:"free_memory,omitempty"`
Discovery CPU details for default thread selection (#6264) On windows, detect large multi-socket systems and reduce to the number of cores in one socket for best performance 2024-10-15 18:36:08 +00:00			FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`}`

			// Beginning of an `ollama info` command
Discovery CPU details for default thread selection (#6264) On windows, detect large multi-socket systems and reduce to the number of cores in one socket for best performance 2024-10-15 18:36:08 +00:00			`type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`memInfo`
			Library string `json:"library,omitempty"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
Support multiple variants for a given llm lib type In some cases we may want multiple variants for a given GPU type or CPU. This adds logic to have an optional Variant which we can use to select an optimal library, but also allows us to try multiple variants in case some fail to load. This can be useful for scenarios such as ROCm v5 vs v6 incompatibility or potentially CPU features. 2024-01-05 20:13:08 +00:00			`// Optional variant to select (e.g. versions, cpu feature flags)`
Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms 2024-05-31 04:54:07 +00:00			Variant string `json:"variant"`
Support multiple variants for a given llm lib type In some cases we may want multiple variants for a given GPU type or CPU. This adds logic to have an optional Variant which we can use to select an optimal library, but also allows us to try multiple variants in case some fail to load. This can be useful for scenarios such as ROCm v5 vs v6 incompatibility or potentially CPU features. 2024-01-05 20:13:08 +00:00
update memory calcualtions count each layer independently when deciding gpu offloading 2024-03-18 09:45:22 +00:00			`// MinimumMemory represents the minimum memory required to use the GPU`
partial offloading 2024-04-05 21:50:38 +00:00			MinimumMemory uint64 `json:"-"`
update memory calcualtions count each layer independently when deciding gpu offloading 2024-03-18 09:45:22 +00:00
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly`
Jetpack support for Go server (#7217) This adds support for the Jetson JetPack variants into the Go runner 2024-11-12 18:31:52 +00:00			DependencyPath []string `json:"lib_path,omitempty"`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00
Workaround gfx900 SDMA bugs Implement support for GPU env var workarounds, and leverage this for the Vega RX 56 which needs HSA_ENABLE_SDMA=0 set to work properly 2024-05-31 23:15:21 +00:00			`// Extra environment variables specific to the GPU as list of [key,value]`
			EnvWorkarounds [][2]string `json:"envs,omitempty"`

Disable concurrency for AMD + Windows Until ROCm v6.2 ships, we wont be able to get accurate free memory reporting on windows, which makes automatic concurrency too risky. Users can still opt-in but will need to pay attention to model sizes otherwise they may thrash/page VRAM or cause OOM crashes. All other platforms and GPUs have accurate VRAM reporting wired up now, so we can turn on concurrency by default. 2024-06-19 20:35:38 +00:00			`// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates`
			`// the FreeMemory is best effort, and may over or under report actual memory usage`
			`// False indicates FreeMemory can generally be trusted on this GPU`
			`UnreliableFreeMemory bool`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`// GPU information`
Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX. 2024-05-07 21:54:26 +00:00			ID string `json:"gpu_id"` // string to use for selection of this specific GPU
			Name string `json:"name"` // user friendly name if available
			Compute string `json:"compute"` // Compute Capability or gfx

			`// Driver Information - TODO no need to put this on each GPU`
			DriverMajor int `json:"driver_major,omitempty"`
			DriverMinor int `json:"driver_minor,omitempty"`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00
			`// TODO other performance capability info to help in scheduling decisions`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
Detect AMD GPU info via sysfs and block old cards This wires up some new logic to start using sysfs to discover AMD GPU information and detects old cards we can't yet support so we can fallback to CPU mode. 2024-02-11 22:50:06 +00:00
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`type CPUInfo struct {`
			`GpuInfo`
Discovery CPU details for default thread selection (#6264) On windows, detect large multi-socket systems and reduce to the number of cores in one socket for best performance 2024-10-15 18:36:08 +00:00			`CPUs []CPU`
			`}`

			`// CPU type represents a CPU Package occupying a socket`
			`type CPU struct {`
			ID string `cpuinfo:"processor"`
			VendorID string `cpuinfo:"vendor_id"`
			ModelName string `cpuinfo:"model name"`
			`CoreCount int`
			`EfficiencyCoreCount int // Performance = CoreCount - Efficiency`
			`ThreadCount int`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`}`

			`type CudaGPUInfo struct {`
			`GpuInfo`
Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants. 2024-06-14 03:46:14 +00:00			`OSOverhead uint64 // Memory overhead between the driver library and management library`
			`index int //nolint:unused,nolintlint`
			`computeMajor int //nolint:unused,nolintlint`
			`computeMinor int //nolint:unused,nolintlint`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`}`
			`type CudaGPUInfoList []CudaGPUInfo`

			`type RocmGPUInfo struct {`
			`GpuInfo`
review comments and coverage 2024-06-05 19:07:20 +00:00			`usedFilepath string //nolint:unused,nolintlint`
			`index int //nolint:unused,nolintlint`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`}`
			`type RocmGPUInfoList []RocmGPUInfo`

			`type OneapiGPUInfo struct {`
			`GpuInfo`
review comments and coverage 2024-06-05 19:07:20 +00:00			`driverIndex int //nolint:unused,nolintlint`
			`gpuIndex int //nolint:unused,nolintlint`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`}`
			`type OneapiGPUInfoList []OneapiGPUInfo`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`type GpuInfoList []GpuInfo`

Track GPU discovery failure information (#5820) * Expose GPU discovery failure information * Remove exposed API for now 2024-10-14 23:26:45 +00:00			`type UnsupportedGPUInfo struct {`
			`GpuInfo`
			Reason string `json:"reason"`
			`}`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`// Split up the set of gpu info's by Library and variant`
			`func (l GpuInfoList) ByLibrary() []GpuInfoList {`
			`resp := []GpuInfoList{}`
			`libs := []string{}`
			`for _, info := range l {`
			`found := false`
			`requested := info.Library`
Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms 2024-05-31 04:54:07 +00:00			`if info.Variant != CPUCapabilityNone.String() {`
			`requested += "_" + info.Variant`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`}`
			`for i, lib := range libs {`
			`if lib == requested {`
			`resp[i] = append(resp[i], info)`
			`found = true`
			`break`
			`}`
			`}`
			`if !found {`
gpu: Group GPU Library sets by variant (#6483) The recent cuda variant changes uncovered a bug in ByLibrary which failed to group by common variant for GPU types. 2024-08-23 22:11:56 +00:00			`libs = append(libs, requested)`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`resp = append(resp, []GpuInfo{info})`
			`}`
			`}`
			`return resp`
Detect AMD GPU info via sysfs and block old cards This wires up some new logic to start using sysfs to discover AMD GPU information and detects old cards we can't yet support so we can fallback to CPU mode. 2024-02-11 22:50:06 +00:00			`}`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00
Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX. 2024-05-07 21:54:26 +00:00			`// Report the GPU information into the log an Info level`
			`func (l GpuInfoList) LogDetails() {`
			`for _, g := range l {`
			`slog.Info("inference compute",`
			`"id", g.ID,`
			`"library", g.Library,`
Report GPU variant in log 2024-06-19 16:36:30 +00:00			`"variant", g.Variant,`
Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX. 2024-05-07 21:54:26 +00:00			`"compute", g.Compute,`
			`"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),`
			`"name", g.Name,`
			`"total", format.HumanBytes2(g.TotalMemory),`
			`"available", format.HumanBytes2(g.FreeMemory),`
			`)`
			`}`
			`}`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`// Sort by Free Space`
			`type ByFreeMemory []GpuInfo`

			`func (a ByFreeMemory) Len() int { return len(a) }`
			`func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }`
			`func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00
			`type CPUCapability uint32`

			`// Override at build time when building base GPU runners`
			`var GPURunnerCPUCapability = CPUCapabilityAVX`

			`const (`
review comments and coverage 2024-06-05 19:07:20 +00:00			`CPUCapabilityNone CPUCapability = iota`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`CPUCapabilityAVX`
			`CPUCapabilityAVX2`
			`// TODO AVX512`
			`)`

review comments and coverage 2024-06-05 19:07:20 +00:00			`func (c CPUCapability) String() string {`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`switch c {`
			`case CPUCapabilityAVX:`
			`return "avx"`
			`case CPUCapabilityAVX2:`
			`return "avx2"`
			`default:`
review comments and coverage 2024-06-05 19:07:20 +00:00			`return "no vector extensions"`
Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`}`
			`}`
Track GPU discovery failure information (#5820) * Expose GPU discovery failure information * Remove exposed API for now 2024-10-14 23:26:45 +00:00
			`type SystemInfo struct {`
			System CPUInfo `json:"system"`
			GPUs []GpuInfo `json:"gpus"`
			UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
			DiscoveryErrors []string `json:"discovery_errors"`
			`}`
Discovery CPU details for default thread selection (#6264) On windows, detect large multi-socket systems and reduce to the number of cores in one socket for best performance 2024-10-15 18:36:08 +00:00
			`// Return the optimal number of threads to use for inference`
			`func (si SystemInfo) GetOptimalThreadCount() int {`
			`if len(si.System.CPUs) == 0 {`
			`return 0`
			`}`
Refine default thread selection for NUMA systems (#7322) Until we have full NUMA support, this adjusts the default thread selection algorithm to count up the number of performance cores across all sockets. 2024-10-30 22:05:45 +00:00
			`coreCount := 0`
			`for _, c := range si.System.CPUs {`
			`coreCount += c.CoreCount - c.EfficiencyCoreCount`
			`}`

			`return coreCount`
Discovery CPU details for default thread selection (#6264) On windows, detect large multi-socket systems and reduce to the number of cores in one socket for best performance 2024-10-15 18:36:08 +00:00			`}`