parent 7d6eb0d4c3
commit 05cd82ef94
33 changed files with 94 additions and 94 deletions
@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "errors"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "bufio"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "bytes"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "os"
@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 "log/slog"
@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
@@ -1,6 +1,6 @@
 //go:build darwin

-package gpu
+package discover

 /*
 #cgo CFLAGS: -x objective-c
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "bufio"
@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 "log/slog"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "runtime"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "fmt"
@@ -1,4 +1,4 @@
-package gpu
+package discover

 import "testing"

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 "fmt"
@@ -7,13 +7,13 @@ import (
 "strings"

 "github.com/ollama/ollama/api"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
-"github.com/ollama/ollama/gpu"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 // Split up the GPUs by type and try them
 var estimatedVRAM uint64
 for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {

 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 // Graph size for a partial offload, applies to all GPUs
 var graphPartialOffload uint64

@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 gpuAllocations := make([]uint64, len(gpus))
 type gs struct {
 i int
-g *gpu.GpuInfo
+g *discover.GpuInfo
 }
 gpusWithSpace := []gs{}
 for i := range gpus {
@@ -10,7 +10,7 @@ import (
 "github.com/stretchr/testify/require"

 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/gpu"
+"github.com/ollama/ollama/discover"
 )

 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
 }

 // Simple CPU scenario
-gpus := []gpu.GpuInfo{
+gpus := []discover.GpuInfo{
 {
 Library: "cpu",
 },
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {

 // Dual CUDA scenario with assymetry
 gpuMinimumMemory := uint64(2048)
-gpus = []gpu.GpuInfo{
+gpus = []discover.GpuInfo{
 {
 Library: "cuda",
 MinimumMemory: gpuMinimumMemory,
@@ -26,9 +26,9 @@ import (

 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/build"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
-"github.com/ollama/ollama/gpu"
 "github.com/ollama/ollama/llama"
 "github.com/ollama/ollama/runners"
 )
@@ -61,7 +61,7 @@ type llmServer struct {
 estimate MemoryEstimate
 totalLayers uint64
 // gpuCount int
-gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+gpus discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
 loadDuration time.Duration // Record how long it took the model to load
 loadProgress float32

@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 var err error
 var cpuRunner string
 var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 var systemFreeMemory uint64
 var systemSwapFreeMemory uint64

-systemInfo := gpu.GetSystemInfo()
+systemInfo := discover.GetSystemInfo()
 systemTotalMemory = systemInfo.System.TotalMemory
 systemFreeMemory = systemInfo.System.FreeMemory
 systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 if opts.NumGPU == 0 {
-gpus = gpu.GetCPUInfo()
+gpus = discover.GetCPUInfo()
 }
 if len(gpus) == 1 && gpus[0].Library == "cpu" {
 cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 case gpus[0].Library != "metal" && estimate.Layers == 0:
 // Don't bother loading into the GPU if no layers can fit
 cpuRunner = runners.ServerForCpu()
-gpus = gpu.GetCPUInfo()
+gpus = discover.GetCPUInfo()
 case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 opts.NumGPU = estimate.Layers
 }
@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 }

 if strings.HasPrefix(servers[i], "cpu") {
-gpus = gpu.GetCPUInfo()
+gpus = discover.GetCPUInfo()
 }

 // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -18,8 +18,8 @@ import (

 "golang.org/x/sync/errgroup"

+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
-"github.com/ollama/ollama/gpu"
 )

 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
 // glob workDir for files that start with ollama_
 availableServers := GetAvailableServers(runnersDir)
 requested := info.Library
-if info.Variant != gpu.CPUCapabilityNone.String() {
+if info.Variant != discover.CPUCapabilityNone.String() {
 requested += "_" + info.Variant
 }

@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
 if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
 // Load up the best CPU variant if not primary requested
 if info.Library != "cpu" {
-variant := gpu.GetCPUCapability()
+variant := discover.GetCPUCapability()
 // If no variant, then we fall back to default
 // If we have a variant, try that if we find an exact match
 // Attempting to run the wrong CPU instructions will panic the
 // process
-if variant != gpu.CPUCapabilityNone {
+if variant != discover.CPUCapabilityNone {
 for cmp := range availableServers {
 if cmp == "cpu_"+variant.String() {
 servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
 if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 return "metal"
 }
-variant := gpu.GetCPUCapability()
+variant := discover.GetCPUCapability()
 availableServers := GetAvailableServers(runnersDir)
-if variant != gpu.CPUCapabilityNone {
+if variant != discover.CPUCapabilityNone {
 for cmp := range availableServers {
 if cmp == "cpu_"+variant.String() {
 return cmp
@@ -27,8 +27,8 @@ import (

 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/build"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
-"github.com/ollama/ollama/gpu"
 "github.com/ollama/ollama/llm"
 "github.com/ollama/ollama/openai"
 "github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {

 // At startup we retrieve GPU information so we can get log messages before loading a model
 // This will log warnings to the log in case we have problems with detected GPUs
-gpus := gpu.GetGPUInfo()
+gpus := discover.GetGPUInfo()
 gpus.LogDetails()

 err = srvr.Serve(ln)
@@ -15,7 +15,7 @@ import (
 "github.com/google/go-cmp/cmp"

 "github.com/ollama/ollama/api"
-"github.com/ollama/ollama/gpu"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/llm"
 )

@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 return
 }

-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 return mock, nil
 }
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
 unloadedCh: make(chan any, 1),
 loaded: make(map[string]*runnerRef),
 newServerFn: newMockServer(&mock),
-getGpuFn: gpu.GetGPUInfo,
-getCpuFn: gpu.GetCPUInfo,
+getGpuFn: discover.GetGPUInfo,
+getCpuFn: discover.GetCPUInfo,
 reschedDelay: 250 * time.Millisecond,
-loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 // add small delay to simulate loading
 time.Sleep(time.Millisecond)
 req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
 unloadedCh: make(chan any, 1),
 loaded: make(map[string]*runnerRef),
 newServerFn: newMockServer(&mock),
-getGpuFn: gpu.GetGPUInfo,
-getCpuFn: gpu.GetCPUInfo,
+getGpuFn: discover.GetGPUInfo,
+getCpuFn: discover.GetCPUInfo,
 reschedDelay: 250 * time.Millisecond,
-loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 // add small delay to simulate loading
 time.Sleep(time.Millisecond)
 req.successCh <- &runnerRef{
@@ -15,9 +15,9 @@ import (
 "time"

 "github.com/ollama/ollama/api"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
-"github.com/ollama/ollama/gpu"
 "github.com/ollama/ollama/llm"
 )

@@ -41,10 +41,10 @@ type Scheduler struct {
 loaded map[string]*runnerRef
 loadedMu sync.Mutex

-loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-getGpuFn func() gpu.GpuInfoList
-getCpuFn func() gpu.GpuInfoList
+loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+getGpuFn func() discover.GpuInfoList
+getCpuFn func() discover.GpuInfoList
 reschedDelay time.Duration
 }

@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 unloadedCh: make(chan interface{}, maxQueue),
 loaded: make(map[string]*runnerRef),
 newServerFn: llm.NewLlamaServer,
-getGpuFn: gpu.GetGPUInfo,
-getCpuFn: gpu.GetCPUInfo,
+getGpuFn: discover.GetGPUInfo,
+getCpuFn: discover.GetCPUInfo,
 reschedDelay: 250 * time.Millisecond,
 }
 sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 } else {
 // Either no models are loaded or below envconfig.MaxRunners
 // Get a refreshed GPU list
-var gpus gpu.GpuInfoList
+var gpus discover.GpuInfoList
 if pending.opts.NumGPU == 0 {
 gpus = s.getCpuFn()
 } else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 }()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 if numParallel < 1 {
 numParallel = 1
 }
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 }()
 }

-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 type predKey struct {
 Library string
 ID string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+ret := append(discover.GpuInfoList{}, allGpus...)
 s.loadedMu.Lock()
 defer s.loadedMu.Unlock()
 for _, runner := range s.loaded {
@@ -542,7 +542,7 @@ type runnerRef struct {

 llama llm.LlamaServer
 loading bool // True only during initial load, then false forever
-gpus gpu.GpuInfoList // Recorded at time of provisioning
+gpus discover.GpuInfoList // Recorded at time of provisioning
 estimatedVRAM uint64
 estimatedTotal uint64

@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 start := time.Now()

 // Establish a baseline before we unload
-gpusBefore := gpu.GetGPUInfo()
+gpusBefore := discover.GetGPUInfo()
 var totalMemoryBefore, freeMemoryBefore uint64
 for _, gpu := range gpusBefore {
 totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 }

 // Query GPUs, look for free to go back up
-gpusNow := gpu.GetGPUInfo()
+gpusNow := discover.GetGPUInfo()
 var totalMemoryNow, freeMemoryNow uint64
 for _, gpu := range gpusNow {
 totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 var estimatedVRAM uint64

 var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 for _, gl := range gpus.ByLibrary() {
 var ok bool
-sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

 // TODO - potentially sort by performance capability, existing models loaded, etc.
 // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

 // First attempt to fit the model into a single GPU
 for _, p := range numParallelToTry {
 req.opts.NumCtx = req.origNumCtx * p
 if !envconfig.SchedSpread() {
 for _, g := range sgl {
-if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 *numParallel = p
-return []gpu.GpuInfo{g}
+return []discover.GpuInfo{g}
 }
 }
 }
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 if *numParallel <= 0 {
 *numParallel = 1
 req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {

 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 slog.Debug("evaluating if CPU model load will fit in available system memory")
 estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 if estimate.TotalSize <= gpus[0].FreeMemory {
@@ -13,8 +13,8 @@ import (

 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/app/lifecycle"
+"github.com/ollama/ollama/discover"
 "github.com/ollama/ollama/format"
-"github.com/ollama/ollama/gpu"
 "github.com/ollama/ollama/llm"
 )

@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
 sessionDuration: &api.Duration{Duration: 2 * time.Second},
 }
 // Fail to load model first
-s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 return nil, errors.New("something failed to load model blah")
 }
-gpus := gpu.GpuInfoList{}
+gpus := discover.GpuInfoList{}
 s.load(req, ggml, gpus, 0)
 require.Empty(t, req.successCh)
 require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
 require.Contains(t, err.Error(), "this model may be incompatible")

 server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 return server, nil
 }
 s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
 ggml *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 return scenario.srv, nil
 }

@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 return b
 }

-func getGpuFn() gpu.GpuInfoList {
-g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+g := discover.GpuInfo{Library: "metal"}
 g.TotalMemory = 24 * format.GigaByte
 g.FreeMemory = 12 * format.GigaByte
-return []gpu.GpuInfo{g}
+return []discover.GpuInfo{g}
 }

-func getCpuFn() gpu.GpuInfoList {
-g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+g := discover.GpuInfo{Library: "cpu"}
 g.TotalMemory = 32 * format.GigaByte
 g.FreeMemory = 26 * format.GigaByte
-return []gpu.GpuInfo{g}
+return []discover.GpuInfo{g}
 }

 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
 }

 var ggml *llm.GGML
-gpus := gpu.GpuInfoList{}
+gpus := discover.GpuInfoList{}
 server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 return server, nil
 }
 s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
 // Same model, same request
 scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 s := InitScheduler(ctx)
-s.getGpuFn = func() gpu.GpuInfoList {
-g := gpu.GpuInfo{Library: "metal"}
+s.getGpuFn = func() discover.GpuInfoList {
+g := discover.GpuInfo{Library: "metal"}
 g.TotalMemory = 24 * format.GigaByte
 g.FreeMemory = 12 * format.GigaByte
-return []gpu.GpuInfo{g}
+return []discover.GpuInfo{g}
 }
 s.newServerFn = scenario1a.newServer
 successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
 ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 defer done()
-gpus := gpu.GpuInfoList{
+gpus := discover.GpuInfoList{
 {
 Library: "a",
 ID: "1",
@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 defer done()
-gpus := gpu.GpuInfoList{
+gpus := discover.GpuInfoList{
 {
 Library: "cuda",
 ID: "0",
@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 ID: "1",
 },
 }
-r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}

 s := InitScheduler(ctx)
 s.loadedMu.Lock()
@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 require.Len(t, tmp, 1)
 require.Equal(t, "1", tmp[0].ID)

-r1.gpus = gpu.GpuInfoList{gpus[1]}
+r1.gpus = discover.GpuInfoList{gpus[1]}
 tmp = s.filterGPUsWithoutLoadingModels(gpus)
 require.Len(t, tmp, 1)
 require.Equal(t, "0", tmp[0].ID)

-r1.gpus = gpu.GpuInfoList{}
+r1.gpus = discover.GpuInfoList{}
 tmp = s.filterGPUsWithoutLoadingModels(gpus)
 require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 defer done()
 s := InitScheduler(ctx)

-s.getGpuFn = func() gpu.GpuInfoList {
+s.getGpuFn = func() discover.GpuInfoList {
 // Set memory values to require the model to be spread
-gpus := []gpu.GpuInfo{
+gpus := []discover.GpuInfo{
 {Library: "cuda"},
 {Library: "rocm"},
 }
@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 }
 s.getCpuFn = getCpuFn
 a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 require.Len(t, gpus, 1)
 return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 }