Rename gpu package discover (#7143)

Cleaning up go package naming
Daniel Hiltgen 2024-10-16 17:45:00 -07:00 committed by GitHub
parent 7d6eb0d4c3
commit 05cd82ef94
33 changed files with 94 additions and 94 deletions
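
For downstream code the rename is mechanical: the import path changes from github.com/ollama/ollama/gpu to github.com/ollama/ollama/discover and the package qualifier changes with it, while the exported names stay the same. A minimal sketch of a caller, assuming only the functions and fields that appear in the hunks below (the caller itself is hypothetical):

package main

import (
    "fmt"

    "github.com/ollama/ollama/discover" // was "github.com/ollama/ollama/gpu"
)

func main() {
    // Same call as before the rename, only the qualifier changed:
    // gpu.GetGPUInfo() -> discover.GetGPUInfo(), returning a discover.GpuInfoList.
    gpus := discover.GetGPUInfo()
    for _, g := range gpus {
        // Library, ID, TotalMemory and FreeMemory are the GpuInfo fields
        // exercised elsewhere in this commit's hunks.
        fmt.Println(g.Library, g.ID, g.TotalMemory, g.FreeMemory)
    }
}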

File (path not shown):

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
     "errors"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "errors"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "bufio"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "bytes"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "os"

File (path not shown):

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
     "log/slog"

File (path not shown):

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm

File (path not shown):

@@ -1,6 +1,6 @@
 //go:build darwin

-package gpu
+package discover

 /*
 #cgo CFLAGS: -x objective-c

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "bufio"

File (path not shown):

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
     "log/slog"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "runtime"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "fmt"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import "testing"

File (path not shown):

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
     "fmt"

File (path not shown):

@@ -7,13 +7,13 @@ import (
     "strings"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
     // Split up the GPUs by type and try them
     var estimatedVRAM uint64
     for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
     // Graph size for a partial offload, applies to all GPUs
     var graphPartialOffload uint64
@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     gpuAllocations := make([]uint64, len(gpus))
     type gs struct {
         i int
-        g *gpu.GpuInfo
+        g *discover.GpuInfo
     }
     gpusWithSpace := []gs{}
     for i := range gpus {

File (path not shown):

@@ -10,7 +10,7 @@ import (
     "github.com/stretchr/testify/require"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/discover"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
     }

     // Simple CPU scenario
-    gpus := []gpu.GpuInfo{
+    gpus := []discover.GpuInfo{
         {
             Library: "cpu",
         },
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {

     // Dual CUDA scenario with assymetry
     gpuMinimumMemory := uint64(2048)
-    gpus = []gpu.GpuInfo{
+    gpus = []discover.GpuInfo{
         {
             Library: "cuda",
             MinimumMemory: gpuMinimumMemory,

File (path not shown):

@@ -26,9 +26,9 @@ import (

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/build"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llama"
     "github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
     estimate MemoryEstimate
     totalLayers uint64
     // gpuCount int
-    gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+    gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
     loadDuration time.Duration // Record how long it took the model to load
     loadProgress float32

     sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
     var err error
     var cpuRunner string
     var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     var systemFreeMemory uint64
     var systemSwapFreeMemory uint64

-    systemInfo := gpu.GetSystemInfo()
+    systemInfo := discover.GetSystemInfo()
     systemTotalMemory = systemInfo.System.TotalMemory
     systemFreeMemory = systemInfo.System.FreeMemory
     systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

     // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
     if opts.NumGPU == 0 {
-        gpus = gpu.GetCPUInfo()
+        gpus = discover.GetCPUInfo()
     }
     if len(gpus) == 1 && gpus[0].Library == "cpu" {
         cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     case gpus[0].Library != "metal" && estimate.Layers == 0:
         // Don't bother loading into the GPU if no layers can fit
         cpuRunner = runners.ServerForCpu()
-        gpus = gpu.GetCPUInfo()
+        gpus = discover.GetCPUInfo()
     case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
         opts.NumGPU = estimate.Layers
     }
@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }

         if strings.HasPrefix(servers[i], "cpu") {
-            gpus = gpu.GetCPUInfo()
+            gpus = discover.GetCPUInfo()
         }

         // Find an availableServers port, retry on each iteration in case the failure was a port conflict race

File (path not shown):

@@ -18,8 +18,8 @@ import (

     "golang.org/x/sync/errgroup"

+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/gpu"
 )

 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {

 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
     // glob workDir for files that start with ollama_
     availableServers := GetAvailableServers(runnersDir)
     requested := info.Library
-    if info.Variant != gpu.CPUCapabilityNone.String() {
+    if info.Variant != discover.CPUCapabilityNone.String() {
         requested += "_" + info.Variant
     }
@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
     if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
         // Load up the best CPU variant if not primary requested
         if info.Library != "cpu" {
-            variant := gpu.GetCPUCapability()
+            variant := discover.GetCPUCapability()
             // If no variant, then we fall back to default
             // If we have a variant, try that if we find an exact match
             // Attempting to run the wrong CPU instructions will panic the
             // process
-            if variant != gpu.CPUCapabilityNone {
+            if variant != discover.CPUCapabilityNone {
                 for cmp := range availableServers {
                     if cmp == "cpu_"+variant.String() {
                         servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
     if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
         return "metal"
     }
-    variant := gpu.GetCPUCapability()
+    variant := discover.GetCPUCapability()
     availableServers := GetAvailableServers(runnersDir)
-    if variant != gpu.CPUCapabilityNone {
+    if variant != discover.CPUCapabilityNone {
        for cmp := range availableServers {
            if cmp == "cpu_"+variant.String() {
                return cmp

File (path not shown):

@@ -27,8 +27,8 @@ import (

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/build"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {

     // At startup we retrieve GPU information so we can get log messages before loading a model
     // This will log warnings to the log in case we have problems with detected GPUs
-    gpus := gpu.GetGPUInfo()
+    gpus := discover.GetGPUInfo()
     gpus.LogDetails()

     err = srvr.Serve(ln)

File (path not shown):

@@ -15,7 +15,7 @@ import (
     "github.com/google/go-cmp/cmp"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/llm"
 )
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
     return
 }

-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-    return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+    return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return mock, nil
     }
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
         unloadedCh: make(chan any, 1),
         loaded: make(map[string]*runnerRef),
         newServerFn: newMockServer(&mock),
-        getGpuFn: gpu.GetGPUInfo,
-        getCpuFn: gpu.GetCPUInfo,
+        getGpuFn: discover.GetGPUInfo,
+        getCpuFn: discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
         unloadedCh: make(chan any, 1),
         loaded: make(map[string]*runnerRef),
         newServerFn: newMockServer(&mock),
-        getGpuFn: gpu.GetGPUInfo,
-        getCpuFn: gpu.GetCPUInfo,
+        getGpuFn: discover.GetGPUInfo,
+        getCpuFn: discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{

File (path not shown):

@@ -15,9 +15,9 @@ import (
     "time"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
 )
@@ -41,10 +41,10 @@ type Scheduler struct {
     loaded map[string]*runnerRef
     loadedMu sync.Mutex

-    loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-    newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-    getGpuFn func() gpu.GpuInfoList
-    getCpuFn func() gpu.GpuInfoList
+    loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+    newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+    getGpuFn func() discover.GpuInfoList
+    getCpuFn func() discover.GpuInfoList
     reschedDelay time.Duration
 }
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
         unloadedCh: make(chan interface{}, maxQueue),
         loaded: make(map[string]*runnerRef),
         newServerFn: llm.NewLlamaServer,
-        getGpuFn: gpu.GetGPUInfo,
-        getCpuFn: gpu.GetCPUInfo,
+        getGpuFn: discover.GetGPUInfo,
+        getCpuFn: discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
     }
     sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
         } else {
             // Either no models are loaded or below envconfig.MaxRunners
             // Get a refreshed GPU list
-            var gpus gpu.GpuInfoList
+            var gpus discover.GpuInfoList
             if pending.opts.NumGPU == 0 {
                 gpus = s.getCpuFn()
             } else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
     }()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
     if numParallel < 1 {
         numParallel = 1
     }
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
     }()
 }

-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
     type predKey struct {
         Library string
         ID string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-    ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+    ret := append(discover.GpuInfoList{}, allGpus...)
     s.loadedMu.Lock()
     defer s.loadedMu.Unlock()
     for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
     // unloading bool // set to true when we are trying to unload the runner

     llama llm.LlamaServer
     loading bool // True only during initial load, then false forever
-    gpus gpu.GpuInfoList // Recorded at time of provisioning
+    gpus discover.GpuInfoList // Recorded at time of provisioning
     estimatedVRAM uint64
     estimatedTotal uint64
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
     start := time.Now()

     // Establish a baseline before we unload
-    gpusBefore := gpu.GetGPUInfo()
+    gpusBefore := discover.GetGPUInfo()
     var totalMemoryBefore, freeMemoryBefore uint64
     for _, gpu := range gpusBefore {
         totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
         }

         // Query GPUs, look for free to go back up
-        gpusNow := gpu.GetGPUInfo()
+        gpusNow := discover.GetGPUInfo()
         var totalMemoryNow, freeMemoryNow uint64
         for _, gpu := range gpusNow {
             totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
     var estimatedVRAM uint64

     var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
     for _, gl := range gpus.ByLibrary() {
         var ok bool
-        sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+        sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
         // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-        sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+        sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

         // First attempt to fit the model into a single GPU
         for _, p := range numParallelToTry {
             req.opts.NumCtx = req.origNumCtx * p
             if !envconfig.SchedSpread() {
                 for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                         slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
                         *numParallel = p
-                        return []gpu.GpuInfo{g}
+                        return []discover.GpuInfo{g}
                     }
                 }
             }
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
     if *numParallel <= 0 {
         *numParallel = 1
         req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
     slog.Debug("evaluating if CPU model load will fit in available system memory")
     estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
     if estimate.TotalSize <= gpus[0].FreeMemory {

File (path not shown):

@@ -13,8 +13,8 @@ import (

     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/app/lifecycle"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
 )
@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
         sessionDuration: &api.Duration{Duration: 2 * time.Second},
     }
     // Fail to load model first
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return nil, errors.New("something failed to load model blah")
     }
-    gpus := gpu.GpuInfoList{}
+    gpus := discover.GpuInfoList{}
     s.load(req, ggml, gpus, 0)
     require.Empty(t, req.successCh)
     require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
     require.Contains(t, err.Error(), "this model may be incompatible")

     server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return server, nil
     }
     s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
     ggml *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
     return scenario.srv, nil
 }
@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
     return b
 }

-func getGpuFn() gpu.GpuInfoList {
-    g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+    g := discover.GpuInfo{Library: "metal"}
     g.TotalMemory = 24 * format.GigaByte
     g.FreeMemory = 12 * format.GigaByte
-    return []gpu.GpuInfo{g}
+    return []discover.GpuInfo{g}
 }

-func getCpuFn() gpu.GpuInfoList {
-    g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+    g := discover.GpuInfo{Library: "cpu"}
     g.TotalMemory = 32 * format.GigaByte
     g.FreeMemory = 26 * format.GigaByte
-    return []gpu.GpuInfo{g}
+    return []discover.GpuInfo{g}
 }

 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
     }

     var ggml *llm.GGML
-    gpus := gpu.GpuInfoList{}
+    gpus := discover.GpuInfoList{}
     server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return server, nil
     }
     s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
     // Same model, same request
     scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
     s := InitScheduler(ctx)
-    s.getGpuFn = func() gpu.GpuInfoList {
-        g := gpu.GpuInfo{Library: "metal"}
+    s.getGpuFn = func() discover.GpuInfoList {
+        g := discover.GpuInfo{Library: "metal"}
         g.TotalMemory = 24 * format.GigaByte
         g.FreeMemory = 12 * format.GigaByte
-        return []gpu.GpuInfo{g}
+        return []discover.GpuInfo{g}
     }
     s.newServerFn = scenario1a.newServer
     successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
     defer done()
-    gpus := gpu.GpuInfoList{
+    gpus := discover.GpuInfoList{
         {
             Library: "a",
             ID: "1",
@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
     defer done()
-    gpus := gpu.GpuInfoList{
+    gpus := discover.GpuInfoList{
         {
             Library: "cuda",
             ID: "0",
@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
             ID: "1",
         },
     }
-    r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+    r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}

     s := InitScheduler(ctx)
     s.loadedMu.Lock()
@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
     require.Len(t, tmp, 1)
    require.Equal(t, "1", tmp[0].ID)

-    r1.gpus = gpu.GpuInfoList{gpus[1]}
+    r1.gpus = discover.GpuInfoList{gpus[1]}
     tmp = s.filterGPUsWithoutLoadingModels(gpus)
     require.Len(t, tmp, 1)
     require.Equal(t, "0", tmp[0].ID)

-    r1.gpus = gpu.GpuInfoList{}
+    r1.gpus = discover.GpuInfoList{}
     tmp = s.filterGPUsWithoutLoadingModels(gpus)
     require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
     defer done()
     s := InitScheduler(ctx)

-    s.getGpuFn = func() gpu.GpuInfoList {
+    s.getGpuFn = func() discover.GpuInfoList {
         // Set memory values to require the model to be spread
-        gpus := []gpu.GpuInfo{
+        gpus := []discover.GpuInfo{
             {Library: "cuda"},
             {Library: "rocm"},
         }
@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
     }
     s.getCpuFn = getCpuFn
     a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         require.Len(t, gpus, 1)
         return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
     }