2024-03-30 16:50:05 +00:00
|
|
|
package server
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"context"
|
|
|
|
"encoding/binary"
|
|
|
|
"fmt"
|
|
|
|
"log/slog"
|
|
|
|
"os"
|
|
|
|
"testing"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/ollama/ollama/api"
|
|
|
|
"github.com/ollama/ollama/app/lifecycle"
|
2024-05-22 05:21:04 +00:00
|
|
|
"github.com/ollama/ollama/envconfig"
|
2024-03-30 16:50:05 +00:00
|
|
|
"github.com/ollama/ollama/format"
|
|
|
|
"github.com/ollama/ollama/gpu"
|
|
|
|
"github.com/ollama/ollama/llm"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
)
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
os.Setenv("OLLAMA_DEBUG", "1")
|
|
|
|
lifecycle.InitLogging()
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestInitScheduler(t *testing.T) {
|
|
|
|
ctx, done := context.WithCancel(context.Background())
|
|
|
|
defer done()
|
|
|
|
s := InitScheduler(ctx)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.NotNil(t, s.loaded)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestLoad(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
s := InitScheduler(ctx)
|
2024-04-24 23:37:03 +00:00
|
|
|
var ggml *llm.GGML // value not used in tests
|
2024-03-30 16:50:05 +00:00
|
|
|
req := &LlmRequest{
|
|
|
|
ctx: ctx,
|
|
|
|
model: &Model{ModelPath: "foo"},
|
2024-04-28 20:40:31 +00:00
|
|
|
opts: api.DefaultOptions(),
|
2024-03-30 16:50:05 +00:00
|
|
|
successCh: make(chan *runnerRef, 1),
|
|
|
|
errCh: make(chan error, 1),
|
2024-07-02 22:12:43 +00:00
|
|
|
sessionDuration: &api.Duration{Duration: 2 * time.Second},
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
// Fail to load model first
|
2024-05-07 00:47:52 +00:00
|
|
|
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
2024-03-30 16:50:05 +00:00
|
|
|
return nil, fmt.Errorf("something failed to load model blah")
|
|
|
|
}
|
|
|
|
gpus := gpu.GpuInfoList{}
|
2024-05-07 00:47:52 +00:00
|
|
|
s.load(req, ggml, gpus, 0)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, req.successCh)
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, req.errCh, 1)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.loaded)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
err := <-req.errCh
|
|
|
|
require.Contains(t, err.Error(), "this model may be incompatible")
|
|
|
|
|
2024-06-04 02:09:23 +00:00
|
|
|
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
2024-05-07 00:47:52 +00:00
|
|
|
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
2024-03-30 16:50:05 +00:00
|
|
|
return server, nil
|
|
|
|
}
|
2024-05-07 00:47:52 +00:00
|
|
|
s.load(req, ggml, gpus, 0)
|
2024-03-30 16:50:05 +00:00
|
|
|
select {
|
|
|
|
case err := <-req.errCh:
|
|
|
|
require.NoError(t, err)
|
|
|
|
case resp := <-req.successCh:
|
|
|
|
require.Equal(t, uint64(10), resp.estimatedVRAM)
|
|
|
|
require.Equal(t, uint(1), resp.refCount)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.loaded, 1)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
req.model.ModelPath = "dummy_model_path"
|
|
|
|
server.waitResp = fmt.Errorf("wait failure")
|
2024-05-07 00:47:52 +00:00
|
|
|
s.load(req, ggml, gpus, 0)
|
2024-03-30 16:50:05 +00:00
|
|
|
select {
|
|
|
|
case err := <-req.errCh:
|
|
|
|
require.Contains(t, err.Error(), "wait failure")
|
|
|
|
case resp := <-req.successCh:
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatalf("unexpected success %v", resp)
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
runner := s.loaded["dummy_model_path"]
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.NotNil(t, runner)
|
|
|
|
require.Equal(t, uint(0), runner.refCount)
|
2024-04-23 20:07:16 +00:00
|
|
|
time.Sleep(1 * time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.expiredCh, 1)
|
|
|
|
}
|
|
|
|
|
|
|
|
type bundle struct {
|
|
|
|
ctx context.Context //nolint:containedctx
|
|
|
|
ctxDone func()
|
|
|
|
srv *mockLlm
|
|
|
|
req *LlmRequest
|
2024-04-24 23:17:24 +00:00
|
|
|
ggml *llm.GGML
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
2024-05-07 00:47:52 +00:00
|
|
|
func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
2024-03-30 16:50:05 +00:00
|
|
|
return scenario.srv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
|
|
|
|
scenario := &bundle{}
|
|
|
|
scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
f, err := os.CreateTemp(t.TempDir(), modelName)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.NoError(t, err)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
gguf := llm.NewGGUFV3(binary.LittleEndian)
|
|
|
|
err = gguf.Encode(f, llm.KV{
|
|
|
|
"general.architecture": "llama",
|
|
|
|
"general.name": "name",
|
|
|
|
"llama.context_length": uint32(32),
|
|
|
|
"llama.embedding_length": uint32(4096),
|
|
|
|
"llama.block_count": uint32(1),
|
|
|
|
"llama.attention.head_count": uint32(32),
|
|
|
|
"llama.attention.head_count_kv": uint32(32),
|
|
|
|
"tokenizer.ggml.tokens": []string{" "},
|
|
|
|
"tokenizer.ggml.scores": []float32{0},
|
|
|
|
"tokenizer.ggml.token_type": []int32{0},
|
|
|
|
}, []llm.Tensor{
|
2024-06-25 04:47:52 +00:00
|
|
|
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
|
|
|
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
2024-03-30 16:50:05 +00:00
|
|
|
})
|
2024-05-22 05:21:04 +00:00
|
|
|
require.NoError(t, err)
|
2024-04-24 23:17:24 +00:00
|
|
|
|
2024-03-30 16:50:05 +00:00
|
|
|
fname := f.Name()
|
|
|
|
model := &Model{Name: modelName, ModelPath: fname}
|
2024-06-25 04:47:52 +00:00
|
|
|
scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
|
2024-03-30 16:50:05 +00:00
|
|
|
require.NoError(t, err)
|
2024-04-24 23:17:24 +00:00
|
|
|
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario.req = &LlmRequest{
|
|
|
|
ctx: scenario.ctx,
|
|
|
|
model: model,
|
2024-04-28 20:40:31 +00:00
|
|
|
opts: api.DefaultOptions(),
|
2024-07-02 22:12:43 +00:00
|
|
|
sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
|
2024-03-30 16:50:05 +00:00
|
|
|
successCh: make(chan *runnerRef, 1),
|
|
|
|
errCh: make(chan error, 1),
|
|
|
|
}
|
2024-06-04 02:09:23 +00:00
|
|
|
scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
|
2024-03-30 16:50:05 +00:00
|
|
|
return scenario
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestRequests(t *testing.T) {
|
2024-06-14 16:48:25 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
|
|
|
|
// Same model, same request
|
|
|
|
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
|
|
|
|
scenario1b.req.model = scenario1a.req.model
|
2024-04-24 23:17:24 +00:00
|
|
|
scenario1b.ggml = scenario1a.ggml
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
// simple reload of same model
|
|
|
|
scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
|
2024-05-14 00:17:36 +00:00
|
|
|
tmpModel := *scenario1a.req.model
|
|
|
|
scenario2a.req.model = &tmpModel
|
2024-04-24 23:17:24 +00:00
|
|
|
scenario2a.ggml = scenario1a.ggml
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
// Multiple loaded models
|
|
|
|
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
|
|
|
|
scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
|
2024-04-28 20:40:31 +00:00
|
|
|
scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
|
|
|
|
scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
|
|
|
|
scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
s := InitScheduler(ctx)
|
|
|
|
s.getGpuFn = func() gpu.GpuInfoList {
|
|
|
|
g := gpu.GpuInfo{Library: "metal"}
|
|
|
|
g.TotalMemory = 24 * format.GigaByte
|
|
|
|
g.FreeMemory = 12 * format.GigaByte
|
|
|
|
return []gpu.GpuInfo{g}
|
|
|
|
}
|
2024-06-04 02:09:23 +00:00
|
|
|
s.getCpuFn = func() gpu.GpuInfoList {
|
|
|
|
g := gpu.GpuInfo{Library: "cpu"}
|
|
|
|
g.TotalMemory = 32 * format.GigaByte
|
|
|
|
g.FreeMemory = 26 * format.GigaByte
|
|
|
|
return []gpu.GpuInfo{g}
|
|
|
|
}
|
2024-03-30 16:50:05 +00:00
|
|
|
s.newServerFn = scenario1a.newServer
|
|
|
|
slog.Info("scenario1a")
|
|
|
|
s.pendingReqCh <- scenario1a.req
|
|
|
|
require.Len(t, s.pendingReqCh, 1)
|
|
|
|
s.Run(ctx)
|
|
|
|
select {
|
|
|
|
case resp := <-scenario1a.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario1a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario1a.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario1a.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Same runner as first request due to not needing a reload
|
|
|
|
s.newServerFn = scenario1b.newServer
|
|
|
|
slog.Info("scenario1b")
|
|
|
|
s.pendingReqCh <- scenario1b.req
|
|
|
|
select {
|
|
|
|
case resp := <-scenario1b.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario1a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario1b.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario1b.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Trigger a reload
|
|
|
|
s.newServerFn = scenario2a.newServer
|
|
|
|
scenario2a.req.model.AdapterPaths = []string{"new"}
|
|
|
|
slog.Info("scenario2a")
|
|
|
|
s.pendingReqCh <- scenario2a.req
|
|
|
|
// finish first two requests, so model can reload
|
|
|
|
time.Sleep(1 * time.Millisecond)
|
|
|
|
scenario1a.ctxDone()
|
|
|
|
scenario1b.ctxDone()
|
|
|
|
select {
|
|
|
|
case resp := <-scenario2a.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario2a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario2a.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario2a.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
2024-05-04 18:46:01 +00:00
|
|
|
envconfig.MaxRunners = 1
|
2024-03-30 16:50:05 +00:00
|
|
|
s.newServerFn = scenario3a.newServer
|
|
|
|
slog.Info("scenario3a")
|
|
|
|
s.pendingReqCh <- scenario3a.req
|
|
|
|
// finish prior request, so new model can load
|
|
|
|
time.Sleep(1 * time.Millisecond)
|
|
|
|
scenario2a.ctxDone()
|
|
|
|
select {
|
|
|
|
case resp := <-scenario3a.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario3a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario3a.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario3a.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.loaded, 1)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
2024-05-04 18:46:01 +00:00
|
|
|
envconfig.MaxRunners = 0
|
2024-03-30 16:50:05 +00:00
|
|
|
s.newServerFn = scenario3b.newServer
|
|
|
|
slog.Info("scenario3b")
|
|
|
|
s.pendingReqCh <- scenario3b.req
|
|
|
|
select {
|
|
|
|
case resp := <-scenario3b.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario3b.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario3b.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario3b.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.loaded, 2)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
2024-04-28 20:40:31 +00:00
|
|
|
// This is a CPU load with NumGPU = 0 so it should load
|
2024-03-30 16:50:05 +00:00
|
|
|
s.newServerFn = scenario3c.newServer
|
|
|
|
slog.Info("scenario3c")
|
2024-04-28 20:40:31 +00:00
|
|
|
s.pendingReqCh <- scenario3c.req
|
|
|
|
select {
|
|
|
|
case resp := <-scenario3c.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario3c.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario3c.req.errCh)
|
2024-07-03 21:47:42 +00:00
|
|
|
case err := <-scenario3c.req.errCh:
|
|
|
|
t.Fatal(err.Error())
|
2024-04-28 20:40:31 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-04-28 20:40:31 +00:00
|
|
|
}
|
|
|
|
s.loadedMu.Lock()
|
|
|
|
require.Len(t, s.loaded, 3)
|
|
|
|
s.loadedMu.Unlock()
|
|
|
|
|
|
|
|
// Try to load a model that wont fit
|
|
|
|
s.newServerFn = scenario3d.newServer
|
|
|
|
slog.Info("scenario3d")
|
|
|
|
s.loadedMu.Lock()
|
|
|
|
require.Len(t, s.loaded, 3)
|
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
|
|
|
|
time.Sleep(2 * time.Millisecond)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.pendingReqCh <- scenario3d.req
|
2024-03-30 16:50:05 +00:00
|
|
|
// finish prior request, so new model can load
|
|
|
|
time.Sleep(6 * time.Millisecond)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
|
|
|
require.Len(t, s.loaded, 2)
|
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario3b.ctxDone()
|
|
|
|
select {
|
2024-04-28 20:40:31 +00:00
|
|
|
case resp := <-scenario3d.req.successCh:
|
|
|
|
require.Equal(t, resp.llama, scenario3d.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario3d.req.errCh)
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
|
|
|
require.Len(t, s.loaded, 2)
|
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestGetRunner(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
|
|
|
|
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
|
2024-05-04 18:46:01 +00:00
|
|
|
envconfig.MaxQueuedRequests = 1
|
2024-03-30 16:50:05 +00:00
|
|
|
s := InitScheduler(ctx)
|
|
|
|
s.getGpuFn = func() gpu.GpuInfoList {
|
|
|
|
g := gpu.GpuInfo{Library: "metal"}
|
|
|
|
g.TotalMemory = 24 * format.GigaByte
|
|
|
|
g.FreeMemory = 12 * format.GigaByte
|
|
|
|
return []gpu.GpuInfo{g}
|
|
|
|
}
|
|
|
|
s.newServerFn = scenario1a.newServer
|
|
|
|
slog.Info("scenario1a")
|
|
|
|
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
|
|
|
|
require.Len(t, s.pendingReqCh, 1)
|
|
|
|
slog.Info("scenario1b")
|
|
|
|
successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
|
|
|
|
require.Len(t, s.pendingReqCh, 1)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, successCh1b)
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, errCh1b, 1)
|
|
|
|
err := <-errCh1b
|
|
|
|
require.Contains(t, err.Error(), "server busy")
|
|
|
|
s.Run(ctx)
|
|
|
|
select {
|
|
|
|
case resp := <-successCh1a:
|
|
|
|
require.Equal(t, resp.llama, scenario1a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, errCh1a)
|
2024-03-30 16:50:05 +00:00
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
scenario1a.ctxDone()
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.loaded, 1)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
scenario1c.req.model.ModelPath = "bad path"
|
|
|
|
slog.Info("scenario1c")
|
|
|
|
successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
|
2024-05-06 21:15:37 +00:00
|
|
|
// Starts in pending channel, then should be quickly processsed to return an error
|
2024-04-24 23:17:24 +00:00
|
|
|
time.Sleep(5 * time.Millisecond)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, successCh1c)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.loaded)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, errCh1c, 1)
|
|
|
|
err = <-errCh1c
|
|
|
|
require.Contains(t, err.Error(), "bad path")
|
|
|
|
scenario1b.ctxDone()
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO - add one scenario that triggers the bogus finished event with positive ref count
|
|
|
|
func TestPrematureExpired(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
|
|
|
|
// Same model, same request
|
|
|
|
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
|
|
|
s := InitScheduler(ctx)
|
|
|
|
s.getGpuFn = func() gpu.GpuInfoList {
|
|
|
|
g := gpu.GpuInfo{Library: "metal"}
|
|
|
|
g.TotalMemory = 24 * format.GigaByte
|
|
|
|
g.FreeMemory = 12 * format.GigaByte
|
|
|
|
return []gpu.GpuInfo{g}
|
|
|
|
}
|
|
|
|
s.newServerFn = scenario1a.newServer
|
|
|
|
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
|
|
|
|
require.Len(t, s.pendingReqCh, 1)
|
|
|
|
s.Run(ctx)
|
|
|
|
select {
|
|
|
|
case resp := <-successCh1a:
|
|
|
|
require.Equal(t, resp.llama, scenario1a.srv)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, errCh1a)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Len(t, s.loaded, 1)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
slog.Info("sending premature expired event now")
|
|
|
|
s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
|
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
2024-07-02 22:12:43 +00:00
|
|
|
time.Sleep(scenario1a.req.sessionDuration.Duration)
|
2024-03-30 16:50:05 +00:00
|
|
|
scenario1a.ctxDone()
|
|
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
require.LessOrEqual(t, len(s.finishedReqCh), 1)
|
|
|
|
time.Sleep(10 * time.Millisecond)
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.finishedReqCh)
|
2024-04-24 23:17:24 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-05-22 05:21:04 +00:00
|
|
|
require.Empty(t, s.loaded)
|
2024-04-24 23:17:24 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
// also shouldn't happen in real life
|
|
|
|
s.finishedReqCh <- scenario1a.req
|
|
|
|
time.Sleep(5 * time.Millisecond)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestUseLoadedRunner(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
req := &LlmRequest{
|
|
|
|
ctx: ctx,
|
2024-04-28 20:40:31 +00:00
|
|
|
opts: api.DefaultOptions(),
|
2024-03-30 16:50:05 +00:00
|
|
|
successCh: make(chan *runnerRef, 1),
|
2024-07-02 22:12:43 +00:00
|
|
|
sessionDuration: &api.Duration{Duration: 2},
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
finished := make(chan *LlmRequest)
|
2024-06-04 02:09:23 +00:00
|
|
|
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
2024-05-07 00:47:52 +00:00
|
|
|
r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
|
2024-03-30 16:50:05 +00:00
|
|
|
req.useLoadedRunner(r1, finished)
|
|
|
|
require.Equal(t, uint(1), r1.refCount)
|
|
|
|
require.Equal(t, time.Duration(2), r1.sessionDuration)
|
|
|
|
select {
|
|
|
|
case success := <-req.successCh:
|
|
|
|
require.Equal(t, r1, success)
|
|
|
|
case <-ctx.Done():
|
2024-05-07 00:47:52 +00:00
|
|
|
t.Fatal("timeout")
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
done()
|
|
|
|
fin := <-finished
|
|
|
|
require.Equal(t, req, fin)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestUpdateFreeSpace(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
gpus := gpu.GpuInfoList{
|
|
|
|
{
|
|
|
|
Library: "a",
|
|
|
|
ID: "1",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Library: "a",
|
|
|
|
ID: "2",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
gpus[0].TotalMemory = 1000
|
|
|
|
gpus[0].FreeMemory = 900
|
|
|
|
gpus[1].TotalMemory = 2000
|
|
|
|
gpus[1].FreeMemory = 1900
|
2024-06-04 02:09:23 +00:00
|
|
|
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
|
|
|
|
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
|
2024-05-07 00:47:52 +00:00
|
|
|
r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
|
|
|
|
r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
s := InitScheduler(ctx)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
s.loaded["a"] = r1
|
|
|
|
s.loaded["b"] = r2
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
s.updateFreeSpace(gpus)
|
2024-06-04 02:09:23 +00:00
|
|
|
require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
|
|
|
|
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
2024-06-05 19:07:20 +00:00
|
|
|
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
|
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
|
|
|
defer done()
|
|
|
|
gpus := gpu.GpuInfoList{
|
|
|
|
{
|
|
|
|
Library: "cuda",
|
|
|
|
ID: "0",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Library: "cuda",
|
|
|
|
ID: "1",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
|
|
|
|
|
|
|
|
s := InitScheduler(ctx)
|
|
|
|
s.loadedMu.Lock()
|
|
|
|
s.loaded["a"] = r1
|
|
|
|
s.loadedMu.Unlock()
|
|
|
|
|
|
|
|
tmp := s.filterGPUsWithoutLoadingModels(gpus)
|
|
|
|
require.Len(t, tmp, 1)
|
|
|
|
require.Equal(t, "1", tmp[0].ID)
|
|
|
|
|
|
|
|
r1.gpus = gpu.GpuInfoList{gpus[1]}
|
|
|
|
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
|
|
|
require.Len(t, tmp, 1)
|
|
|
|
require.Equal(t, "0", tmp[0].ID)
|
|
|
|
|
|
|
|
r1.gpus = gpu.GpuInfoList{}
|
|
|
|
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
|
|
|
require.Len(t, tmp, 2)
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestFindRunnerToUnload(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
2024-05-06 00:18:27 +00:00
|
|
|
|
2024-05-07 00:47:52 +00:00
|
|
|
r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
|
|
|
|
r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
|
2024-03-30 16:50:05 +00:00
|
|
|
|
|
|
|
s := InitScheduler(ctx)
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
s.loaded["a"] = r1
|
|
|
|
s.loaded["b"] = r2
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
|
2024-05-06 00:18:27 +00:00
|
|
|
resp := s.findRunnerToUnload()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Equal(t, r2, resp)
|
|
|
|
r2.refCount = 1
|
2024-05-06 00:18:27 +00:00
|
|
|
resp = s.findRunnerToUnload()
|
2024-03-30 16:50:05 +00:00
|
|
|
require.Equal(t, r1, resp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestNeedsReload(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
|
2024-06-04 02:09:23 +00:00
|
|
|
llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
2024-04-28 20:40:31 +00:00
|
|
|
do := api.DefaultOptions()
|
2024-03-30 16:50:05 +00:00
|
|
|
runner := &runnerRef{
|
2024-05-07 00:47:52 +00:00
|
|
|
model: &Model{
|
|
|
|
AdapterPaths: []string{"adapter1"},
|
|
|
|
ProjectorPaths: []string{"projector1"},
|
|
|
|
},
|
|
|
|
Options: &do,
|
|
|
|
llama: llm,
|
|
|
|
numParallel: 1,
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
req := &LlmRequest{
|
|
|
|
model: &Model{
|
|
|
|
AdapterPaths: []string{"adapter2"},
|
|
|
|
ProjectorPaths: []string{"projector2"},
|
|
|
|
},
|
2024-04-28 20:40:31 +00:00
|
|
|
opts: api.DefaultOptions(),
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
resp := runner.needsReload(ctx, req)
|
|
|
|
require.True(t, resp)
|
2024-05-14 00:17:36 +00:00
|
|
|
req.model.AdapterPaths = runner.model.AdapterPaths
|
2024-03-30 16:50:05 +00:00
|
|
|
resp = runner.needsReload(ctx, req)
|
|
|
|
require.True(t, resp)
|
2024-05-14 00:17:36 +00:00
|
|
|
req.model.ProjectorPaths = runner.model.ProjectorPaths
|
2024-03-30 16:50:05 +00:00
|
|
|
runner.loading = true
|
|
|
|
req.opts.NumBatch = 1234
|
|
|
|
resp = runner.needsReload(ctx, req)
|
|
|
|
require.True(t, resp)
|
|
|
|
req.opts.NumBatch = runner.Options.NumBatch
|
|
|
|
llm.pingResp = fmt.Errorf("foo")
|
|
|
|
resp = runner.needsReload(ctx, req)
|
|
|
|
require.True(t, resp)
|
|
|
|
llm.pingResp = nil
|
|
|
|
resp = runner.needsReload(ctx, req)
|
|
|
|
require.False(t, resp)
|
|
|
|
req.opts.NumGPU = 99
|
|
|
|
resp = runner.needsReload(ctx, req)
|
2024-04-25 23:02:40 +00:00
|
|
|
require.True(t, resp)
|
|
|
|
req.opts.NumGPU = -1
|
|
|
|
resp = runner.needsReload(ctx, req)
|
2024-03-30 16:50:05 +00:00
|
|
|
require.False(t, resp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestUnloadAllRunners(t *testing.T) {
|
2024-05-03 16:08:33 +00:00
|
|
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
2024-03-30 16:50:05 +00:00
|
|
|
defer done()
|
|
|
|
|
2024-06-04 02:09:23 +00:00
|
|
|
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
|
|
|
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
2024-03-30 16:50:05 +00:00
|
|
|
s := InitScheduler(ctx)
|
|
|
|
s.unloadAllRunners()
|
|
|
|
|
2024-05-07 00:47:52 +00:00
|
|
|
r1 := &runnerRef{llama: llm1, numParallel: 1}
|
|
|
|
r2 := &runnerRef{llama: llm2, numParallel: 1}
|
2024-03-30 16:50:05 +00:00
|
|
|
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Lock()
|
2024-03-30 16:50:05 +00:00
|
|
|
s.loaded["a"] = r1
|
|
|
|
s.loaded["b"] = r2
|
2024-04-28 20:40:31 +00:00
|
|
|
s.loadedMu.Unlock()
|
2024-03-30 16:50:05 +00:00
|
|
|
s.unloadAllRunners()
|
|
|
|
|
|
|
|
require.True(t, llm1.closeCalled)
|
|
|
|
require.True(t, llm2.closeCalled)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestUnload(t *testing.T) {
|
2024-06-04 02:09:23 +00:00
|
|
|
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
|
2024-05-07 00:47:52 +00:00
|
|
|
r1 := &runnerRef{llama: llm1, numParallel: 1}
|
|
|
|
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
|
2024-03-30 16:50:05 +00:00
|
|
|
r1.unload()
|
|
|
|
require.True(t, llm1.closeCalled)
|
|
|
|
r2.unload()
|
2024-05-14 00:17:36 +00:00
|
|
|
require.Nil(t, r2.model)
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
2024-05-07 00:47:52 +00:00
|
|
|
func TestAlreadyCanceled(t *testing.T) {
|
|
|
|
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
|
|
|
defer done()
|
|
|
|
dctx, done2 := context.WithCancel(ctx)
|
|
|
|
done2()
|
|
|
|
scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
|
2024-07-02 22:12:43 +00:00
|
|
|
scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
|
2024-05-07 00:47:52 +00:00
|
|
|
s := InitScheduler(ctx)
|
|
|
|
slog.Info("scenario1a")
|
|
|
|
s.pendingReqCh <- scenario1a.req
|
|
|
|
require.Len(t, s.pendingReqCh, 1)
|
|
|
|
s.Run(ctx)
|
|
|
|
time.Sleep(5 * time.Millisecond)
|
|
|
|
require.Empty(t, s.pendingReqCh)
|
|
|
|
require.Empty(t, scenario1a.req.errCh)
|
|
|
|
require.Empty(t, scenario1a.req.successCh)
|
|
|
|
}
|
|
|
|
|
2024-03-30 16:50:05 +00:00
|
|
|
type mockLlm struct {
|
2024-06-04 02:09:23 +00:00
|
|
|
pingResp error
|
|
|
|
waitResp error
|
|
|
|
completionResp error
|
|
|
|
embeddingResp []float64
|
|
|
|
embeddingRespErr error
|
|
|
|
tokenizeResp []int
|
|
|
|
tokenizeRespErr error
|
|
|
|
detokenizeResp string
|
|
|
|
detonekizeRespErr error
|
|
|
|
closeResp error
|
|
|
|
closeCalled bool
|
|
|
|
estimatedVRAM uint64
|
|
|
|
estimatedTotal uint64
|
|
|
|
estimatedVRAMByGPU map[string]uint64
|
2024-03-30 16:50:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
|
|
|
|
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
|
|
|
|
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
|
|
|
return s.completionResp
|
|
|
|
}
|
|
|
|
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
|
|
|
return s.embeddingResp, s.embeddingRespErr
|
|
|
|
}
|
|
|
|
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
|
|
|
return s.tokenizeResp, s.tokenizeRespErr
|
|
|
|
}
|
|
|
|
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
|
|
|
return s.detokenizeResp, s.detonekizeRespErr
|
|
|
|
}
|
|
|
|
func (s *mockLlm) Close() error {
|
|
|
|
s.closeCalled = true
|
|
|
|
return s.closeResp
|
|
|
|
}
|
2024-06-04 02:09:23 +00:00
|
|
|
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
|
|
|
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
|
2024-06-05 19:07:20 +00:00
|
|
|
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|