diff --git a/gpu/amd_common.go b/discover/amd_common.go
similarity index 99%
rename from gpu/amd_common.go
rename to discover/amd_common.go
index 2894ac2c..bf969240 100644
--- a/gpu/amd_common.go
+++ b/discover/amd_common.go
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"errors"
diff --git a/gpu/amd_hip_windows.go b/discover/amd_hip_windows.go
similarity index 99%
rename from gpu/amd_hip_windows.go
rename to discover/amd_hip_windows.go
index 2cea2824..12a7af0b 100644
--- a/gpu/amd_hip_windows.go
+++ b/discover/amd_hip_windows.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"errors"
diff --git a/gpu/amd_linux.go b/discover/amd_linux.go
similarity index 99%
rename from gpu/amd_linux.go
rename to discover/amd_linux.go
index 72dfb4db..dd8e605c 100644
--- a/gpu/amd_linux.go
+++ b/discover/amd_linux.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bufio"
diff --git a/gpu/amd_windows.go b/discover/amd_windows.go
similarity index 99%
rename from gpu/amd_windows.go
rename to discover/amd_windows.go
index 4da6b7cc..a3a6e0c3 100644
--- a/gpu/amd_windows.go
+++ b/discover/amd_windows.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bytes"
diff --git a/gpu/cpu_common.go b/discover/cpu_common.go
similarity index 97%
rename from gpu/cpu_common.go
rename to discover/cpu_common.go
index 34edcdc5..0faac24a 100644
--- a/gpu/cpu_common.go
+++ b/discover/cpu_common.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"os"
diff --git a/gpu/cuda_common.go b/discover/cuda_common.go
similarity index 99%
rename from gpu/cuda_common.go
rename to discover/cuda_common.go
index aceec70a..878cee8c 100644
--- a/gpu/cuda_common.go
+++ b/discover/cuda_common.go
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"log/slog"
diff --git a/gpu/gpu.go b/discover/gpu.go
similarity index 99%
rename from gpu/gpu.go
rename to discover/gpu.go
index 700c90f9..4f2e0884 100644
--- a/gpu/gpu.go
+++ b/discover/gpu.go
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
diff --git a/gpu/gpu_darwin.go b/discover/gpu_darwin.go
similarity index 99%
rename from gpu/gpu_darwin.go
rename to discover/gpu_darwin.go
index 530b98c9..d3f0303f 100644
--- a/gpu/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -1,6 +1,6 @@
 //go:build darwin
 
-package gpu
+package discover
 
 /*
 #cgo CFLAGS: -x objective-c
diff --git a/gpu/gpu_info.h b/discover/gpu_info.h
similarity index 100%
rename from gpu/gpu_info.h
rename to discover/gpu_info.h
diff --git a/gpu/gpu_info_cudart.c b/discover/gpu_info_cudart.c
similarity index 100%
rename from gpu/gpu_info_cudart.c
rename to discover/gpu_info_cudart.c
diff --git a/gpu/gpu_info_cudart.h b/discover/gpu_info_cudart.h
similarity index 100%
rename from gpu/gpu_info_cudart.h
rename to discover/gpu_info_cudart.h
diff --git a/gpu/gpu_info_darwin.h b/discover/gpu_info_darwin.h
similarity index 100%
rename from gpu/gpu_info_darwin.h
rename to discover/gpu_info_darwin.h
diff --git a/gpu/gpu_info_darwin.m b/discover/gpu_info_darwin.m
similarity index 100%
rename from gpu/gpu_info_darwin.m
rename to discover/gpu_info_darwin.m
diff --git a/gpu/gpu_info_nvcuda.c b/discover/gpu_info_nvcuda.c
similarity index 100%
rename from gpu/gpu_info_nvcuda.c
rename to discover/gpu_info_nvcuda.c
diff --git a/gpu/gpu_info_nvcuda.h b/discover/gpu_info_nvcuda.h
similarity index 100%
rename from gpu/gpu_info_nvcuda.h
rename to discover/gpu_info_nvcuda.h
diff --git a/gpu/gpu_info_nvml.c b/discover/gpu_info_nvml.c
similarity index 100%
rename from gpu/gpu_info_nvml.c
rename to discover/gpu_info_nvml.c
diff --git a/gpu/gpu_info_nvml.h b/discover/gpu_info_nvml.h
similarity index 100%
rename from gpu/gpu_info_nvml.h
rename to discover/gpu_info_nvml.h
diff --git a/gpu/gpu_info_oneapi.c b/discover/gpu_info_oneapi.c
similarity index 100%
rename from gpu/gpu_info_oneapi.c
rename to discover/gpu_info_oneapi.c
diff --git a/gpu/gpu_info_oneapi.h b/discover/gpu_info_oneapi.h
similarity index 100%
rename from gpu/gpu_info_oneapi.h
rename to discover/gpu_info_oneapi.h
diff --git a/gpu/gpu_linux.go b/discover/gpu_linux.go
similarity index 99%
rename from gpu/gpu_linux.go
rename to discover/gpu_linux.go
index b15bc2d2..9273baeb 100644
--- a/gpu/gpu_linux.go
+++ b/discover/gpu_linux.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bufio"
diff --git a/gpu/gpu_oneapi.go b/discover/gpu_oneapi.go
similarity index 96%
rename from gpu/gpu_oneapi.go
rename to discover/gpu_oneapi.go
index 9864bde5..77941f5b 100644
--- a/gpu/gpu_oneapi.go
+++ b/discover/gpu_oneapi.go
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"log/slog"
diff --git a/gpu/gpu_test.go b/discover/gpu_test.go
similarity index 99%
rename from gpu/gpu_test.go
rename to discover/gpu_test.go
index 13a3f544..0c6ef7ba 100644
--- a/gpu/gpu_test.go
+++ b/discover/gpu_test.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"runtime"
diff --git a/gpu/gpu_windows.go b/discover/gpu_windows.go
similarity index 99%
rename from gpu/gpu_windows.go
rename to discover/gpu_windows.go
index da0dce92..e7665a63 100644
--- a/gpu/gpu_windows.go
+++ b/discover/gpu_windows.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"fmt"
diff --git a/gpu/gpu_windows_test.go b/discover/gpu_windows_test.go
similarity index 99%
rename from gpu/gpu_windows_test.go
rename to discover/gpu_windows_test.go
index 27e609cf..c4daa7b7 100644
--- a/gpu/gpu_windows_test.go
+++ b/discover/gpu_windows_test.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import "testing"
 
diff --git a/gpu/types.go b/discover/types.go
similarity index 99%
rename from gpu/types.go
rename to discover/types.go
index 655c8733..56b168c4 100644
--- a/gpu/types.go
+++ b/discover/types.go
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"fmt"
diff --git a/llm/memory.go b/llm/memory.go
index 99db7629..d8dbf0be 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -7,13 +7,13 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64
 
@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
-		g *gpu.GpuInfo
+		g *discover.GpuInfo
 	}
 	gpusWithSpace := []gs{}
 	for i := range gpus {
diff --git a/llm/memory_test.go b/llm/memory_test.go
index ffb14286..73e77d90 100644
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 )
 
 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 
 	// Simple CPU scenario
-	gpus := []gpu.GpuInfo{
+	gpus := []discover.GpuInfo{
 		{
 			Library: "cpu",
 		},
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
 
 	// Dual CUDA scenario with assymetry
 	gpuMinimumMemory := uint64(2048)
-	gpus = []gpu.GpuInfo{
+	gpus = []discover.GpuInfo{
 		{
 			Library:       "cuda",
 			MinimumMemory: gpuMinimumMemory,
diff --git a/llm/server.go b/llm/server.go
index 36bcfad2..03aeeb0e 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -26,9 +26,9 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
 	estimate    MemoryEstimate
 	totalLayers uint64
 	// gpuCount     int
-	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-	loadDuration time.Duration   // Record how long it took the model to load
+	gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+	loadDuration time.Duration        // Record how long it took the model to load
 	loadProgress float32
 
 	sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
 
-	systemInfo := gpu.GetSystemInfo()
+	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory = systemInfo.System.TotalMemory
 	systemFreeMemory = systemInfo.System.FreeMemory
 	systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
-		gpus = gpu.GetCPUInfo()
+		gpus = discover.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = runners.ServerForCpu()
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 		}
@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		}
 
 		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
diff --git a/runners/common.go b/runners/common.go
index 681c397b..19014d75 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -18,8 +18,8 @@ import (
 
 	"golang.org/x/sync/errgroup"
 
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 )
 
 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
+	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant
 	}
@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
 	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
 		// Load up the best CPU variant if not primary requested
 		if info.Library != "cpu" {
-			variant := gpu.GetCPUCapability()
+			variant := discover.GetCPUCapability()
 			// If no variant, then we fall back to default
 			// If we have a variant, try that if we find an exact match
 			// Attempting to run the wrong CPU instructions will panic the
 			// process
-			if variant != gpu.CPUCapabilityNone {
+			if variant != discover.CPUCapabilityNone {
 				for cmp := range availableServers {
 					if cmp == "cpu_"+variant.String() {
 						servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUCapability()
+	variant := discover.GetCPUCapability()
 	availableServers := GetAvailableServers(runnersDir)
-	if variant != gpu.CPUCapabilityNone {
+	if variant != discover.CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
 				return cmp
diff --git a/server/routes.go b/server/routes.go
index 23f9dbfd..c2b9b241 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -27,8 +27,8 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model
 	// This will log warnings to the log in case we have problems with detected GPUs
-	gpus := gpu.GetGPUInfo()
+	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()
 
 	err = srvr.Serve(ln)
diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go
index 480b9672..9cadf56a 100644
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -15,7 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
 			unloadedCh:   make(chan any, 1),
 			loaded:       make(map[string]*runnerRef),
 			newServerFn:  newMockServer(&mock),
-			getGpuFn:     gpu.GetGPUInfo,
-			getCpuFn:     gpu.GetCPUInfo,
+			getGpuFn:     discover.GetGPUInfo,
+			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
 			unloadedCh:   make(chan any, 1),
 			loaded:       make(map[string]*runnerRef),
 			newServerFn:  newMockServer(&mock),
-			getGpuFn:     gpu.GetGPUInfo,
-			getCpuFn:     gpu.GetCPUInfo,
+			getGpuFn:     discover.GetGPUInfo,
+			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
diff --git a/server/sched.go b/server/sched.go
index 3c8656ad..1409ff07 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -15,9 +15,9 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn    func() gpu.GpuInfoList
-	getCpuFn    func() gpu.GpuInfoList
+	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn    func() discover.GpuInfoList
+	getCpuFn    func() discover.GpuInfoList
 
 	reschedDelay time.Duration
 }
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:   make(chan interface{}, maxQueue),
 		loaded:       make(map[string]*runnerRef),
 		newServerFn:  llm.NewLlamaServer,
-		getGpuFn:     gpu.GetGPUInfo,
-		getCpuFn:     gpu.GetCPUInfo,
+		getGpuFn:     discover.GetGPUInfo,
+		getCpuFn:     discover.GetCPUInfo,
 		reschedDelay: 250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus gpu.GpuInfoList
+					var gpus discover.GpuInfoList
 					if pending.opts.NumGPU == 0 {
 						gpus = s.getCpuFn()
 					} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool // set to true when we are trying to unload the runner
 
 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()
 
 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 
 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
diff --git a/server/sched_test.go b/server/sched_test.go
index fe5647c5..c999eee0 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -13,8 +13,8 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	s.load(req, ggml, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
 	ggml *llm.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }
 
@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	return b
 }
 
-func getGpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "metal"}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }
 
-func getCpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "cpu"}
 	g.TotalMemory = 32 * format.GigaByte
 	g.FreeMemory = 26 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }
 
 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
 	}
 
 	var ggml *llm.GGML
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
+	s.getGpuFn = func() discover.GpuInfoList {
+		g := discover.GpuInfo{Library: "metal"}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
+		return []discover.GpuInfo{g}
 	}
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "a",
 			ID:      "1",
@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "cuda",
 			ID:      "0",
@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 			ID:      "1",
 		},
 	}
-	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
 	s := InitScheduler(ctx)
 
 	s.loadedMu.Lock()
@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	require.Len(t, tmp, 1)
 	require.Equal(t, "1", tmp[0].ID)
 
-	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	r1.gpus = discover.GpuInfoList{gpus[1]}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 1)
 	require.Equal(t, "0", tmp[0].ID)
 
-	r1.gpus = gpu.GpuInfoList{}
+	r1.gpus = discover.GpuInfoList{}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	defer done()
 	s := InitScheduler(ctx)
 
-	s.getGpuFn = func() gpu.GpuInfoList {
+	s.getGpuFn = func() discover.GpuInfoList {
 		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
+		gpus := []discover.GpuInfo{
 			{Library: "cuda"},
 			{Library: "rocm"},
 		}
@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}
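
For code that imports the package, this change is purely mechanical: the files move from gpu/ to discover/, the package clause changes from "package gpu" to "package discover", and the exported names used throughout the hunks above (GetGPUInfo, GetCPUInfo, GetSystemInfo, GpuInfo, GpuInfoList, ByFreeMemory, GetCPUCapability, CPUCapabilityNone) are untouched. Below is a minimal sketch of an updated caller, assuming only the API visible in this diff; the standalone main package and the fields printed are illustrative, not part of the patch:

	package main

	import (
		"fmt"

		"github.com/ollama/ollama/discover" // was "github.com/ollama/ollama/gpu"
	)

	func main() {
		// Same calls as before the rename, reached through the new package name.
		gpus := discover.GetGPUInfo()
		for _, g := range gpus {
			fmt.Println(g.Library, g.ID, g.FreeMemory)
		}
	}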