ollama/server/llm_test.go

//go:build integration

package server

import (
	"context"
	"os"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/llm"
)

// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
//        package to avoid circular dependencies

// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
//
// TODO - Fix this ^^

// Shared fixtures for the integration tests in this file. req holds the
// generate requests, and resp holds, at the same index, the lowercase
// substring each test looks for in the lowercased model response.
var (
	resp = [...]string{
		"once upon a time",
		"united states thanksgiving",
	}
	req = [...]api.GenerateRequest{
		api.GenerateRequest{
			Model:   "orca-mini",
			Prompt:  "tell me a short story about agi?",
			Options: map[string]interface{}{},
		},
		api.GenerateRequest{
			Model:   "orca-mini",
			Prompt:  "what is the origin of the us thanksgiving holiday?",
			Options: map[string]interface{}{},
		},
	}
)

// TestIntegrationSimpleOrcaMini sends the first fixture prompt through a
// single runner and checks that the lowercased response contains the
// matching expected text from resp.
func TestIntegrationSimpleOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)

	// Scratch directory handed to llm.Init; removed when the test returns.
	tmpDir, mkErr := os.MkdirTemp("", "ollama")
	require.NoError(t, mkErr)
	defer os.RemoveAll(tmpDir)
	require.NoError(t, llm.Init(tmpDir))

	// Bound the whole prompt/response exchange to one minute.
	timeoutCtx, stop := context.WithTimeout(context.Background(), 60*time.Second)
	defer stop()

	// Fixed seed and zero temperature are set on the default options.
	options := api.DefaultOptions()
	options.Seed = 42
	options.Temperature = 0

	request, want := req[0], resp[0]
	mdl, runner := PrepareModelForPrompts(t, request.Model, options)
	defer runner.Close()

	got := OneShotPromptResponse(t, timeoutCtx, request, mdl, runner)
	assert.Contains(t, strings.ToLower(got), want)
}

// TestIntegrationConcurrentPredictOrcaMini issues every prompt in req
// concurrently against a single shared runner and checks that each lowercased
// response contains the expected text at the same index in resp. The test is
// currently skipped unconditionally (see the t.Skip call below).
//
// TODO
// The server always loads a new runner and closes the old one, which forces serial execution
// At present this test case fails with concurrency problems.  Eventually we should try to
// get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)

	t.Skip("concurrent prediction on single runner not currently supported")

	workDir, err := os.MkdirTemp("", "ollama")
	require.NoError(t, err)
	defer os.RemoveAll(workDir)
	require.NoError(t, llm.Init(workDir))
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	opts := api.DefaultOptions()
	opts.Seed = 42
	opts.Temperature = 0.0
	var wg sync.WaitGroup
	wg.Add(len(req))
	// One model/runner pair is shared by all goroutines in this test.
	model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
	defer llmRunner.Close()
	for i := 0; i < len(req); i++ {
		// i is passed as an argument so each goroutine works on its own index.
		go func(i int) {
			defer wg.Done()
			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
			// Log the prompt this goroutine actually sent (previously always req[0]).
			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
		}(i)
	}
	wg.Wait()
}

// TestIntegrationConcurrentRunnersOrcaMini runs every prompt in req
// concurrently, with each goroutine preparing and closing its own runner, and
// checks that each lowercased response contains the expected text at the same
// index in resp.
func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)
	workDir, err := os.MkdirTemp("", "ollama")
	require.NoError(t, err)
	defer os.RemoveAll(workDir)
	require.NoError(t, llm.Init(workDir))
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	opts := api.DefaultOptions()
	opts.Seed = 42
	opts.Temperature = 0.0
	var wg sync.WaitGroup
	wg.Add(len(req))

	t.Logf("Running %d concurrently", len(req))
	for i := 0; i < len(req); i++ {
		// i is passed as an argument so each goroutine works on its own index.
		go func(i int) {
			defer wg.Done()
			// Use this goroutine's own request for the model name so the model
			// always matches the prompt being sent (previously always req[0]).
			// NOTE(review): if PrepareModelForPrompts or OneShotPromptResponse
			// call t.FailNow (e.g. via require), that is not allowed from a
			// goroutine other than the one running the test - confirm.
			model, llmRunner := PrepareModelForPrompts(t, req[i].Model, opts)
			defer llmRunner.Close()
			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
			// Log the prompt this goroutine actually sent (previously always req[0]).
			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
		}(i)
	}
	wg.Wait()
}

// TODO - create a parallel test with 2 different models once we support concurrency
Guard integration tests with a tag This should help CI avoid running the integration test logic in a container where it's not currently possible. 2023-12-23 00:33:27 +00:00			`//go:build integration`

Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`package server`

			`import (`
			`"context"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"os"`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`"strings"`
			`"sync"`
			`"testing"`
			`"time"`

			`"github.com/stretchr/testify/assert"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"github.com/stretchr/testify/require"`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00
			`"github.com/jmorganca/ollama/api"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"github.com/jmorganca/ollama/llm"`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`)`

			`// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server`
			`// package to avoid circular dependencies`

			`// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)`
			`//`
			`// TODO - Fix this ^^`

			`var (`
			`req = [2]api.GenerateRequest{`
			`{`
			`Model: "orca-mini",`
			`Prompt: "tell me a short story about agi?",`
			`Options: map[string]interface{}{},`
			`}, {`
			`Model: "orca-mini",`
			`Prompt: "what is the origin of the us thanksgiving holiday?",`
			`Options: map[string]interface{}{},`
			`},`
			`}`
			`resp = [2]string{`
			`"once upon a time",`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"united states thanksgiving",`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`}`
			`)`

			`func TestIntegrationSimpleOrcaMini(t *testing.T) {`
			`SkipIFNoTestData(t)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`workDir, err := os.MkdirTemp("", "ollama")`
			`require.NoError(t, err)`
			`defer os.RemoveAll(workDir)`
			`require.NoError(t, llm.Init(workDir))`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)`
			`defer cancel()`
			`opts := api.DefaultOptions()`
			`opts.Seed = 42`
			`opts.Temperature = 0.0`
			`model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)`
			`defer llmRunner.Close()`
			`response := OneShotPromptResponse(t, ctx, req[0], model, llmRunner)`
			`assert.Contains(t, strings.ToLower(response), resp[0])`
			`}`

			`// TODO`
			`// The server always loads a new runner and closes the old one, which forces serial execution`
			`// At present this test case fails with concurrency problems. Eventually we should try to`
			`// get true concurrency working with n_parallel support in the backend`
			`func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {`
			`SkipIFNoTestData(t)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`t.Skip("concurrent prediction on single runner not currently supported")`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
			`workDir, err := os.MkdirTemp("", "ollama")`
			`require.NoError(t, err)`
			`defer os.RemoveAll(workDir)`
			`require.NoError(t, llm.Init(workDir))`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)`
			`defer cancel()`
			`opts := api.DefaultOptions()`
			`opts.Seed = 42`
			`opts.Temperature = 0.0`
			`var wg sync.WaitGroup`
			`wg.Add(len(req))`
			`model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)`
			`defer llmRunner.Close()`
			`for i := 0; i < len(req); i++ {`
			`go func(i int) {`
			`defer wg.Done()`
			`response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)`
			`t.Logf("Prompt: %s\nResponse: %s", req[0].Prompt, response)`
			`assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)`
			`}(i)`
			`}`
			`wg.Wait()`
			`}`

			`func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {`
			`SkipIFNoTestData(t)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`workDir, err := os.MkdirTemp("", "ollama")`
			`require.NoError(t, err)`
			`defer os.RemoveAll(workDir)`
			`require.NoError(t, llm.Init(workDir))`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)`
			`defer cancel()`
			`opts := api.DefaultOptions()`
			`opts.Seed = 42`
			`opts.Temperature = 0.0`
			`var wg sync.WaitGroup`
			`wg.Add(len(req))`

Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`t.Logf("Running %d concurrently", len(req))`
Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 2023-11-14 01:20:34 +00:00			`for i := 0; i < len(req); i++ {`
			`go func(i int) {`
			`defer wg.Done()`
			`model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)`
			`defer llmRunner.Close()`
			`response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)`
			`t.Logf("Prompt: %s\nResponse: %s", req[0].Prompt, response)`
			`assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)`
			`}(i)`
			`}`
			`wg.Wait()`
			`}`

			`// TODO - create a parallel test with 2 different models once we support concurrency`