parent 7d6eb0d4c3
commit 05cd82ef94
33 changed files with 94 additions and 94 deletions
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
     "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bufio"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bytes"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "os"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
    "log/slog"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
@@ -1,6 +1,6 @@
 //go:build darwin
 
-package gpu
+package discover
 
 /*
 #cgo CFLAGS: -x objective-c
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "bufio"
@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
    "log/slog"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "runtime"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "fmt"
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import "testing"
 
@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
    "fmt"
@@ -7,13 +7,13 @@ import (
    "strings"
 
    "github.com/ollama/ollama/api"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
    // Split up the GPUs by type and try them
    var estimatedVRAM uint64
    for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
    // Graph size for a partial offload, applies to all GPUs
    var graphPartialOffload uint64
 
@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
    gpuAllocations := make([]uint64, len(gpus))
    type gs struct {
        i int
-       g *gpu.GpuInfo
+       g *discover.GpuInfo
    }
    gpusWithSpace := []gs{}
    for i := range gpus {
@@ -10,7 +10,7 @@ import (
    "github.com/stretchr/testify/require"
 
    "github.com/ollama/ollama/api"
-   "github.com/ollama/ollama/gpu"
+   "github.com/ollama/ollama/discover"
 )
 
 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
    }
 
    // Simple CPU scenario
-   gpus := []gpu.GpuInfo{
+   gpus := []discover.GpuInfo{
        {
            Library: "cpu",
        },
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
 
    // Dual CUDA scenario with assymetry
    gpuMinimumMemory := uint64(2048)
-   gpus = []gpu.GpuInfo{
+   gpus = []discover.GpuInfo{
        {
            Library: "cuda",
            MinimumMemory: gpuMinimumMemory,
@@ -26,9 +26,9 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/build"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llama"
    "github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
    estimate    MemoryEstimate
    totalLayers uint64
    // gpuCount int
-   gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-   loadDuration time.Duration   // Record how long it took the model to load
+   gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+   loadDuration time.Duration        // Record how long it took the model to load
    loadProgress float32
 
    sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
    var err error
    var cpuRunner string
    var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
    var systemFreeMemory uint64
    var systemSwapFreeMemory uint64
 
-   systemInfo := gpu.GetSystemInfo()
+   systemInfo := discover.GetSystemInfo()
    systemTotalMemory = systemInfo.System.TotalMemory
    systemFreeMemory = systemInfo.System.FreeMemory
    systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@
 
    // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
    if opts.NumGPU == 0 {
-       gpus = gpu.GetCPUInfo()
+       gpus = discover.GetCPUInfo()
    }
    if len(gpus) == 1 && gpus[0].Library == "cpu" {
        cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@
    case gpus[0].Library != "metal" && estimate.Layers == 0:
        // Don't bother loading into the GPU if no layers can fit
        cpuRunner = runners.ServerForCpu()
-       gpus = gpu.GetCPUInfo()
+       gpus = discover.GetCPUInfo()
    case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
        opts.NumGPU = estimate.Layers
    }
@@ -281,7 +281,7 @@
        }
 
        if strings.HasPrefix(servers[i], "cpu") {
-           gpus = gpu.GetCPUInfo()
+           gpus = discover.GetCPUInfo()
        }
 
        // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -18,8 +18,8 @@ import (
 
    "golang.org/x/sync/errgroup"
 
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
-   "github.com/ollama/ollama/gpu"
 )
 
 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
    // glob workDir for files that start with ollama_
    availableServers := GetAvailableServers(runnersDir)
    requested := info.Library
-   if info.Variant != gpu.CPUCapabilityNone.String() {
+   if info.Variant != discover.CPUCapabilityNone.String() {
        requested += "_" + info.Variant
    }
 
@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
    if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
        // Load up the best CPU variant if not primary requested
        if info.Library != "cpu" {
-           variant := gpu.GetCPUCapability()
+           variant := discover.GetCPUCapability()
            // If no variant, then we fall back to default
            // If we have a variant, try that if we find an exact match
            // Attempting to run the wrong CPU instructions will panic the
            // process
-           if variant != gpu.CPUCapabilityNone {
+           if variant != discover.CPUCapabilityNone {
                for cmp := range availableServers {
                    if cmp == "cpu_"+variant.String() {
                        servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
        return "metal"
    }
-   variant := gpu.GetCPUCapability()
+   variant := discover.GetCPUCapability()
    availableServers := GetAvailableServers(runnersDir)
-   if variant != gpu.CPUCapabilityNone {
+   if variant != discover.CPUCapabilityNone {
        for cmp := range availableServers {
            if cmp == "cpu_"+variant.String() {
                return cmp
@@ -27,8 +27,8 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/build"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/openai"
    "github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {
 
    // At startup we retrieve GPU information so we can get log messages before loading a model
    // This will log warnings to the log in case we have problems with detected GPUs
-   gpus := gpu.GetGPUInfo()
+   gpus := discover.GetGPUInfo()
    gpus.LogDetails()
 
    err = srvr.Serve(ln)
@@ -15,7 +15,7 @@ import (
    "github.com/google/go-cmp/cmp"
 
    "github.com/ollama/ollama/api"
-   "github.com/ollama/ollama/gpu"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/llm"
 )
 
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
    return
 }
 
-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-   return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+   return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return mock, nil
    }
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
        unloadedCh: make(chan any, 1),
        loaded: make(map[string]*runnerRef),
        newServerFn: newMockServer(&mock),
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
-       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
            // add small delay to simulate loading
            time.Sleep(time.Millisecond)
            req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
        unloadedCh: make(chan any, 1),
        loaded: make(map[string]*runnerRef),
        newServerFn: newMockServer(&mock),
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
-       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+       loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
            // add small delay to simulate loading
            time.Sleep(time.Millisecond)
            req.successCh <- &runnerRef{
@@ -15,9 +15,9 @@ import (
    "time"
 
    "github.com/ollama/ollama/api"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
    loaded map[string]*runnerRef
    loadedMu sync.Mutex
 
-   loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-   newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-   getGpuFn func() gpu.GpuInfoList
-   getCpuFn func() gpu.GpuInfoList
+   loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+   newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+   getGpuFn func() discover.GpuInfoList
+   getCpuFn func() discover.GpuInfoList
    reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
        unloadedCh: make(chan interface{}, maxQueue),
        loaded: make(map[string]*runnerRef),
        newServerFn: llm.NewLlamaServer,
-       getGpuFn: gpu.GetGPUInfo,
-       getCpuFn: gpu.GetCPUInfo,
+       getGpuFn: discover.GetGPUInfo,
+       getCpuFn: discover.GetCPUInfo,
        reschedDelay: 250 * time.Millisecond,
    }
    sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
            } else {
                // Either no models are loaded or below envconfig.MaxRunners
                // Get a refreshed GPU list
-               var gpus gpu.GpuInfoList
+               var gpus discover.GpuInfoList
                if pending.opts.NumGPU == 0 {
                    gpus = s.getCpuFn()
                } else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
    }()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
    if numParallel < 1 {
        numParallel = 1
    }
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
    }()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
    type predKey struct {
        Library string
        ID string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-   ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+   ret := append(discover.GpuInfoList{}, allGpus...)
    s.loadedMu.Lock()
    defer s.loadedMu.Unlock()
    for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
    // unloading bool // set to true when we are trying to unload the runner
 
    llama llm.LlamaServer
-   loading bool            // True only during initial load, then false forever
-   gpus    gpu.GpuInfoList // Recorded at time of provisioning
+   loading bool                 // True only during initial load, then false forever
+   gpus    discover.GpuInfoList // Recorded at time of provisioning
    estimatedVRAM uint64
    estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
    start := time.Now()
 
    // Establish a baseline before we unload
-   gpusBefore := gpu.GetGPUInfo()
+   gpusBefore := discover.GetGPUInfo()
    var totalMemoryBefore, freeMemoryBefore uint64
    for _, gpu := range gpusBefore {
        totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
        }
 
        // Query GPUs, look for free to go back up
-       gpusNow := gpu.GetGPUInfo()
+       gpusNow := discover.GetGPUInfo()
        var totalMemoryNow, freeMemoryNow uint64
        for _, gpu := range gpusNow {
            totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
    var estimatedVRAM uint64
 
    var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
    for _, gl := range gpus.ByLibrary() {
        var ok bool
-       sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+       sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
        // TODO - potentially sort by performance capability, existing models loaded, etc.
        // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-       sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+       sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
        // First attempt to fit the model into a single GPU
        for _, p := range numParallelToTry {
            req.opts.NumCtx = req.origNumCtx * p
            if !envconfig.SchedSpread() {
                for _, g := range sgl {
-                   if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+                   if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
                        *numParallel = p
-                       return []gpu.GpuInfo{g}
+                       return []discover.GpuInfo{g}
                    }
                }
            }
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
    if *numParallel <= 0 {
        *numParallel = 1
        req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
    slog.Debug("evaluating if CPU model load will fit in available system memory")
    estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
    if estimate.TotalSize <= gpus[0].FreeMemory {
@@ -13,8 +13,8 @@ import (
 
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/app/lifecycle"
+   "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/format"
-   "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
 )
 
@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
        sessionDuration: &api.Duration{Duration: 2 * time.Second},
    }
    // Fail to load model first
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return nil, errors.New("something failed to load model blah")
    }
-   gpus := gpu.GpuInfoList{}
+   gpus := discover.GpuInfoList{}
    s.load(req, ggml, gpus, 0)
    require.Empty(t, req.successCh)
    require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@
    require.Contains(t, err.Error(), "this model may be incompatible")
 
    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
    ggml *llm.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    return scenario.srv, nil
 }
 
@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
    return b
 }
 
-func getGpuFn() gpu.GpuInfoList {
-   g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+   g := discover.GpuInfo{Library: "metal"}
    g.TotalMemory = 24 * format.GigaByte
    g.FreeMemory = 12 * format.GigaByte
-   return []gpu.GpuInfo{g}
+   return []discover.GpuInfo{g}
 }
 
-func getCpuFn() gpu.GpuInfoList {
-   g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+   g := discover.GpuInfo{Library: "cpu"}
    g.TotalMemory = 32 * format.GigaByte
    g.FreeMemory = 26 * format.GigaByte
-   return []gpu.GpuInfo{g}
+   return []discover.GpuInfo{g}
 }
 
 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
    }
 
    var ggml *llm.GGML
-   gpus := gpu.GpuInfoList{}
+   gpus := discover.GpuInfoList{}
    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
    // Same model, same request
    scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
    s := InitScheduler(ctx)
-   s.getGpuFn = func() gpu.GpuInfoList {
-       g := gpu.GpuInfo{Library: "metal"}
+   s.getGpuFn = func() discover.GpuInfoList {
+       g := discover.GpuInfo{Library: "metal"}
        g.TotalMemory = 24 * format.GigaByte
        g.FreeMemory = 12 * format.GigaByte
-       return []gpu.GpuInfo{g}
+       return []discover.GpuInfo{g}
    }
    s.newServerFn = scenario1a.newServer
    successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
-   gpus := gpu.GpuInfoList{
+   gpus := discover.GpuInfoList{
        {
            Library: "a",
            ID: "1",
@@ -563,7 +563,7 @@
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
-   gpus := gpu.GpuInfoList{
+   gpus := discover.GpuInfoList{
        {
            Library: "cuda",
            ID: "0",
@@ -573,7 +573,7 @@
            ID: "1",
        },
    }
-   r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+   r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
 
    s := InitScheduler(ctx)
    s.loadedMu.Lock()
@@ -584,12 +584,12 @@
    require.Len(t, tmp, 1)
    require.Equal(t, "1", tmp[0].ID)
 
-   r1.gpus = gpu.GpuInfoList{gpus[1]}
+   r1.gpus = discover.GpuInfoList{gpus[1]}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 1)
    require.Equal(t, "0", tmp[0].ID)
 
-   r1.gpus = gpu.GpuInfoList{}
+   r1.gpus = discover.GpuInfoList{}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
    defer done()
    s := InitScheduler(ctx)
 
-   s.getGpuFn = func() gpu.GpuInfoList {
+   s.getGpuFn = func() discover.GpuInfoList {
        // Set memory values to require the model to be spread
-       gpus := []gpu.GpuInfo{
+       gpus := []discover.GpuInfo{
            {Library: "cuda"},
            {Library: "rocm"},
        }
@@ -729,7 +729,7 @@
    }
    s.getCpuFn = getCpuFn
    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-   s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+   s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        require.Len(t, gpus, 1)
        return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
    }
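Taken together, the hunks are a mechanical rename of the gpu package to discover: the import path github.com/ollama/ollama/gpu becomes github.com/ollama/ollama/discover, and references such as gpu.GpuInfo, gpu.GpuInfoList, gpu.GetGPUInfo, gpu.GetCPUInfo, gpu.GetSystemInfo, gpu.GetCPUCapability, gpu.CPUCapabilityNone, and gpu.ByFreeMemory become their discover.* equivalents. A minimal sketch of what caller code looks like after this rename (a hypothetical example, not a file from this commit):

package main

import (
	"fmt"

	"github.com/ollama/ollama/discover" // previously github.com/ollama/ollama/gpu
)

func main() {
	// GetGPUInfo is unchanged by this commit apart from its package name;
	// it still returns a GpuInfoList describing the detected devices.
	gpus := discover.GetGPUInfo()
	for _, g := range gpus {
		fmt.Println(g.Library, g.ID, g.FreeMemory)
	}
}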