34b9db5afc
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
34 lines
820 B
Go
34 lines
820 B
Go
package gpu
|
|
|
|
import (
|
|
"runtime"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
func TestBasicGetGPUInfo(t *testing.T) {
|
|
info := GetGPUInfo()
|
|
assert.Greater(t, len(info), 0)
|
|
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
|
|
if info[0].Library != "cpu" {
|
|
assert.Greater(t, info[0].TotalMemory, uint64(0))
|
|
assert.Greater(t, info[0].FreeMemory, uint64(0))
|
|
}
|
|
}
|
|
|
|
func TestCPUMemInfo(t *testing.T) {
|
|
info, err := GetCPUMem()
|
|
assert.NoError(t, err)
|
|
switch runtime.GOOS {
|
|
case "darwin":
|
|
t.Skip("CPU memory not populated on darwin")
|
|
case "linux", "windows":
|
|
assert.Greater(t, info.TotalMemory, uint64(0))
|
|
assert.Greater(t, info.FreeMemory, uint64(0))
|
|
default:
|
|
return
|
|
}
|
|
}
|
|
|
|
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
|