parent 7d6eb0d4c3
commit 05cd82ef94
33 changed files with 94 additions and 94 deletions
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
     "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bufio"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bytes"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "os"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
    "log/slog"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
@@ -1,6 +1,6 @@
 //go:build darwin
 
-package gpu
+package discover
 
 /*
 #cgo CFLAGS: -x objective-c
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bufio"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
    "log/slog"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "runtime"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "fmt"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import "testing"
 
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "fmt"
@@ -7,13 +7,13 @@ import (
    "strings"
 
    "github.com/ollama/ollama/api"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
    // Split up the GPUs by type and try them
    var estimatedVRAM uint64
    for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
    // Graph size for a partial offload, applies to all GPUs
    var graphPartialOffload uint64
 
@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
    gpuAllocations := make([]uint64, len(gpus))
    type gs struct {
        i int
-       g *gpu.GpuInfo
+       g *discover.GpuInfo
    }
    gpusWithSpace := []gs{}
    for i := range gpus {
@@ -10,7 +10,7 @@ import (
    "github.com/stretchr/testify/require"
 
    "github.com/ollama/ollama/api"
-   "github.com/ollama/ollama/gpu"
+   "github.com/ollama/ollama/discover"
 )
 
 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
    }
 
    // Simple CPU scenario
-   gpus := []gpu.GpuInfo{
+   gpus := []discover.GpuInfo{
        {
            Library: "cpu",
        },
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
 
    // Dual CUDA scenario with assymetry
    gpuMinimumMemory := uint64(2048)
-   gpus = []gpu.GpuInfo{
+   gpus = []discover.GpuInfo{
        {
            Library: "cuda",
            MinimumMemory: gpuMinimumMemory,
@@ -26,9 +26,9 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/build"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llama"
    "github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
    estimate    MemoryEstimate
    totalLayers uint64
    // gpuCount int
-   gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-   loadDuration time.Duration   // Record how long it took the model to load
+   gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+   loadDuration time.Duration        // Record how long it took the model to load
    loadProgress float32
 
    sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
    var err error
    var cpuRunner string
    var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
    var systemFreeMemory uint64
    var systemSwapFreeMemory uint64
 
-   systemInfo := gpu.GetSystemInfo()
+   systemInfo := discover.GetSystemInfo()
    systemTotalMemory = systemInfo.System.TotalMemory
    systemFreeMemory = systemInfo.System.FreeMemory
    systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@
 
    // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
    if opts.NumGPU == 0 {
-       gpus = gpu.GetCPUInfo()
+       gpus = discover.GetCPUInfo()
    }
    if len(gpus) == 1 && gpus[0].Library == "cpu" {
        cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@
    case gpus[0].Library != "metal" && estimate.Layers == 0:
        // Don't bother loading into the GPU if no layers can fit
        cpuRunner = runners.ServerForCpu()
-       gpus = gpu.GetCPUInfo()
+       gpus = discover.GetCPUInfo()
    case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
        opts.NumGPU = estimate.Layers
    }
@@ -281,7 +281,7 @@
        }
 
        if strings.HasPrefix(servers[i], "cpu") {
-           gpus = gpu.GetCPUInfo()
+           gpus = discover.GetCPUInfo()
        }
 
        // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -18,8 +18,8 @@ import (
 
    "golang.org/x/sync/errgroup"
 
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
-   "github.com/ollama/ollama/gpu"
 )
 
 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
    // glob workDir for files that start with ollama_
    availableServers := GetAvailableServers(runnersDir)
    requested := info.Library
-   if info.Variant != gpu.CPUCapabilityNone.String() {
+   if info.Variant != discover.CPUCapabilityNone.String() {
        requested += "_" + info.Variant
    }
 
@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
    if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
        // Load up the best CPU variant if not primary requested
        if info.Library != "cpu" {
-           variant := gpu.GetCPUCapability()
+           variant := discover.GetCPUCapability()
            // If no variant, then we fall back to default
            // If we have a variant, try that if we find an exact match
            // Attempting to run the wrong CPU instructions will panic the
            // process
-           if variant != gpu.CPUCapabilityNone {
+           if variant != discover.CPUCapabilityNone {
                for cmp := range availableServers {
                    if cmp == "cpu_"+variant.String() {
                        servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
        return "metal"
    }
-   variant := gpu.GetCPUCapability()
+   variant := discover.GetCPUCapability()
    availableServers := GetAvailableServers(runnersDir)
-   if variant != gpu.CPUCapabilityNone {
+   if variant != discover.CPUCapabilityNone {
        for cmp := range availableServers {
            if cmp == "cpu_"+variant.String() {
                return cmp
@@ -27,8 +27,8 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/build"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/openai"
    "github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {
 
    // At startup we retrieve GPU information so we can get log messages before loading a model
    // This will log warnings to the log in case we have problems with detected GPUs
-   gpus := gpu.GetGPUInfo()
+   gpus := discover.GetGPUInfo()
    gpus.LogDetails()
 
    err = srvr.Serve(ln)
@@ -15,7 +15,7 @@ import (
    "github.com/google/go-cmp/cmp"
 
    "github.com/ollama/ollama/api"
-   "github.com/ollama/ollama/gpu"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/llm"
 )
 
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
    return
 }
 
-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-   return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+   return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return mock, nil
    }
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
        unloadedCh: make(chan any, 1),
        loaded: make(map[string]*runnerRef),
        newServerFn: newMockServer(&mock),
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
-       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
            // add small delay to simulate loading
            time.Sleep(time.Millisecond)
            req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
        unloadedCh: make(chan any, 1),
        loaded: make(map[string]*runnerRef),
        newServerFn: newMockServer(&mock),
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
-       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
            // add small delay to simulate loading
            time.Sleep(time.Millisecond)
            req.successCh <- &runnerRef{
@@ -15,9 +15,9 @@ import (
    "time"
 
    "github.com/ollama/ollama/api"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
    loaded map[string]*runnerRef
    loadedMu sync.Mutex
 
-   loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-   newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-   getGpuFn func() gpu.GpuInfoList
-   getCpuFn func() gpu.GpuInfoList
+   loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+   newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+   getGpuFn func() discover.GpuInfoList
+   getCpuFn func() discover.GpuInfoList
    reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
        unloadedCh: make(chan interface{}, maxQueue),
        loaded: make(map[string]*runnerRef),
        newServerFn: llm.NewLlamaServer,
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
    }
    sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
            } else {
                // Either no models are loaded or below envconfig.MaxRunners
                // Get a refreshed GPU list
-               var gpus gpu.GpuInfoList
+               var gpus discover.GpuInfoList
                if pending.opts.NumGPU == 0 {
                    gpus = s.getCpuFn()
                } else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
    }()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
    if numParallel < 1 {
        numParallel = 1
    }
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
    }()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
    type predKey struct {
        Library string
        ID string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-   ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+   ret := append(discover.GpuInfoList{}, allGpus...)
    s.loadedMu.Lock()
    defer s.loadedMu.Unlock()
    for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
    // unloading bool // set to true when we are trying to unload the runner
 
    llama llm.LlamaServer
-   loading bool            // True only during initial load, then false forever
-   gpus    gpu.GpuInfoList // Recorded at time of provisioning
+   loading bool                 // True only during initial load, then false forever
+   gpus    discover.GpuInfoList // Recorded at time of provisioning
    estimatedVRAM uint64
    estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
    start := time.Now()
 
    // Establish a baseline before we unload
-   gpusBefore := gpu.GetGPUInfo()
+   gpusBefore := discover.GetGPUInfo()
    var totalMemoryBefore, freeMemoryBefore uint64
    for _, gpu := range gpusBefore {
        totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
        }
 
        // Query GPUs, look for free to go back up
-       gpusNow := gpu.GetGPUInfo()
+       gpusNow := discover.GetGPUInfo()
        var totalMemoryNow, freeMemoryNow uint64
        for _, gpu := range gpusNow {
            totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
    var estimatedVRAM uint64
 
    var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
    for _, gl := range gpus.ByLibrary() {
        var ok bool
-       sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+       sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
        // TODO - potentially sort by performance capability, existing models loaded, etc.
        // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-       sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+       sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
        // First attempt to fit the model into a single GPU
        for _, p := range numParallelToTry {
            req.opts.NumCtx = req.origNumCtx * p
            if !envconfig.SchedSpread() {
                for _, g := range sgl {
-                   if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+                   if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
                        *numParallel = p
-                       return []gpu.GpuInfo{g}
+                       return []discover.GpuInfo{g}
                    }
                }
            }
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
    if *numParallel <= 0 {
        *numParallel = 1
        req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
    slog.Debug("evaluating if CPU model load will fit in available system memory")
    estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
    if estimate.TotalSize <= gpus[0].FreeMemory {
@@ -13,8 +13,8 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/app/lifecycle"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
 )
 
@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
        sessionDuration: &api.Duration{Duration: 2 * time.Second},
    }
    // Fail to load model first
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return nil, errors.New("something failed to load model blah")
    }
-   gpus := gpu.GpuInfoList{}
+   gpus := discover.GpuInfoList{}
    s.load(req, ggml, gpus, 0)
    require.Empty(t, req.successCh)
    require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@
    require.Contains(t, err.Error(), "this model may be incompatible")
 
    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
    ggml *llm.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    return scenario.srv, nil
 }
 
@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
    return b
 }
 
-func getGpuFn() gpu.GpuInfoList {
-   g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+   g := discover.GpuInfo{Library: "metal"}
    g.TotalMemory = 24 * format.GigaByte
    g.FreeMemory = 12 * format.GigaByte
-   return []gpu.GpuInfo{g}
+   return []discover.GpuInfo{g}
 }
 
-func getCpuFn() gpu.GpuInfoList {
-   g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+   g := discover.GpuInfo{Library: "cpu"}
    g.TotalMemory = 32 * format.GigaByte
    g.FreeMemory = 26 * format.GigaByte
-   return []gpu.GpuInfo{g}
+   return []discover.GpuInfo{g}
 }
 
 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
    }
 
    var ggml *llm.GGML
-   gpus := gpu.GpuInfoList{}
+   gpus := discover.GpuInfoList{}
    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
    // Same model, same request
    scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
    s := InitScheduler(ctx)
-   s.getGpuFn = func() gpu.GpuInfoList {
-       g := gpu.GpuInfo{Library: "metal"}
+   s.getGpuFn = func() discover.GpuInfoList {
+       g := discover.GpuInfo{Library: "metal"}
        g.TotalMemory = 24 * format.GigaByte
        g.FreeMemory = 12 * format.GigaByte
-       return []gpu.GpuInfo{g}
+       return []discover.GpuInfo{g}
    }
    s.newServerFn = scenario1a.newServer
    successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
-   gpus := gpu.GpuInfoList{
+   gpus := discover.GpuInfoList{
        {
            Library: "a",
            ID: "1",
@@ -563,7 +563,7 @@
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
-   gpus := gpu.GpuInfoList{
+   gpus := discover.GpuInfoList{
        {
            Library: "cuda",
            ID: "0",
@@ -573,7 +573,7 @@
            ID: "1",
        },
    }
-   r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+   r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
 
    s := InitScheduler(ctx)
    s.loadedMu.Lock()
@@ -584,12 +584,12 @@
    require.Len(t, tmp, 1)
    require.Equal(t, "1", tmp[0].ID)
 
-   r1.gpus = gpu.GpuInfoList{gpus[1]}
+   r1.gpus = discover.GpuInfoList{gpus[1]}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 1)
    require.Equal(t, "0", tmp[0].ID)
 
-   r1.gpus = gpu.GpuInfoList{}
+   r1.gpus = discover.GpuInfoList{}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
    defer done()
    s := InitScheduler(ctx)
 
-   s.getGpuFn = func() gpu.GpuInfoList {
+   s.getGpuFn = func() discover.GpuInfoList {
        // Set memory values to require the model to be spread
-       gpus := []gpu.GpuInfo{
+       gpus := []discover.GpuInfo{
            {Library: "cuda"},
            {Library: "rocm"},
        }
@@ -729,7 +729,7 @@
    }
    s.getCpuFn = getCpuFn
    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        require.Len(t, gpus, 1)
        return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
    }
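Taken together, the hunks are a mechanical rename of the gpu package to discover: the import path github.com/ollama/ollama/gpu becomes github.com/ollama/ollama/discover, and references such as gpu.GpuInfo, gpu.GpuInfoList, gpu.GetGPUInfo, gpu.GetCPUInfo, gpu.GetSystemInfo, gpu.GetCPUCapability, gpu.CPUCapabilityNone, and gpu.ByFreeMemory become their discover.* equivalents. A minimal sketch of what caller code looks like after this rename (a hypothetical example, not a file from this commit):

package main

import (
	"fmt"

	"github.com/ollama/ollama/discover" // previously github.com/ollama/ollama/gpu
)

func main() {
	// GetGPUInfo is unchanged by this commit apart from its package name;
	// it still returns a GpuInfoList describing the detected devices.
	gpus := discover.GetGPUInfo()
	for _, g := range gpus {
		fmt.Println(g.Library, g.ID, g.FreeMemory)
	}
}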