Merge pull request #783 from jmorganca/mxyng/fix-gpu-offloading
fix: offloading on low end GPUs
This commit is contained in:
commit
d790bf9916
1 changed file with 33 additions and 15 deletions
46
llm/llama.go
46
llm/llama.go
|
@ -31,41 +31,42 @@ var llamaCppEmbed embed.FS
|
||||||
|
|
||||||
type ModelRunner struct {
|
type ModelRunner struct {
|
||||||
Path string // path to the model runner executable
|
Path string // path to the model runner executable
|
||||||
|
Accelerated bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||||
buildPath := path.Join("llama.cpp", runnerType, "build")
|
buildPath := path.Join("llama.cpp", runnerType, "build")
|
||||||
var runners []string
|
var runners []ModelRunner
|
||||||
|
|
||||||
// set the runners based on the OS
|
// set the runners based on the OS
|
||||||
// IMPORTANT: the order of the runners in the array is the priority order
|
// IMPORTANT: the order of the runners in the array is the priority order
|
||||||
switch runtime.GOOS {
|
switch runtime.GOOS {
|
||||||
case "darwin":
|
case "darwin":
|
||||||
runners = []string{
|
runners = []ModelRunner{
|
||||||
path.Join(buildPath, "metal", "bin", "ollama-runner"),
|
{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")},
|
||||||
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
|
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||||
}
|
}
|
||||||
case "linux":
|
case "linux":
|
||||||
runners = []string{
|
runners = []ModelRunner{
|
||||||
path.Join(buildPath, "cuda", "bin", "ollama-runner"),
|
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
|
||||||
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
|
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||||
}
|
}
|
||||||
case "windows":
|
case "windows":
|
||||||
// TODO: select windows GPU runner here when available
|
// TODO: select windows GPU runner here when available
|
||||||
runners = []string{
|
runners = []ModelRunner{
|
||||||
path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe"),
|
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
||||||
runners = []string{
|
runners = []ModelRunner{
|
||||||
path.Join(buildPath, "cpu", "bin", "ollama-runner"),
|
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
|
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
|
||||||
for _, r := range runners {
|
for _, r := range runners {
|
||||||
// find all the files in the runner's bin directory
|
// find all the files in the runner's bin directory
|
||||||
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
|
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// this is expected, ollama may be compiled without all runners packed in
|
// this is expected, ollama may be compiled without all runners packed in
|
||||||
log.Printf("%s runner not found: %v", r, err)
|
log.Printf("%s runner not found: %v", r, err)
|
||||||
|
@ -115,7 +116,10 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||||
localRunnersByPriority := []ModelRunner{}
|
localRunnersByPriority := []ModelRunner{}
|
||||||
for _, r := range runners {
|
for _, r := range runners {
|
||||||
// clean the ModelRunner paths so that they match the OS we are running on
|
// clean the ModelRunner paths so that they match the OS we are running on
|
||||||
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))})
|
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
|
||||||
|
Path: filepath.Clean(path.Join(workDir, r.Path)),
|
||||||
|
Accelerated: r.Accelerated,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
return localRunnersByPriority
|
return localRunnersByPriority
|
||||||
|
@ -215,6 +219,11 @@ func CheckVRAM() (int64, error) {
|
||||||
free += vram
|
free += vram
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if free*1024*1024 < 2*1000*1000*1000 {
|
||||||
|
log.Printf("less than 2 GB VRAM available, falling back to CPU only")
|
||||||
|
free = 0
|
||||||
|
}
|
||||||
|
|
||||||
return free, nil
|
return free, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -276,16 +285,20 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
|
||||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
|
||||||
params := []string{
|
params := []string{
|
||||||
"--model", model,
|
"--model", model,
|
||||||
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
|
||||||
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
|
||||||
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
|
||||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||||
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
|
|
||||||
"--embedding",
|
"--embedding",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if numGPU > 0 {
|
||||||
|
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", numGPU))
|
||||||
|
}
|
||||||
|
|
||||||
if opts.NumGQA > 0 {
|
if opts.NumGQA > 0 {
|
||||||
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
|
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
|
||||||
}
|
}
|
||||||
|
@ -316,6 +329,11 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
|
||||||
|
|
||||||
// start the llama.cpp server with a retry in case the port is already in use
|
// start the llama.cpp server with a retry in case the port is already in use
|
||||||
for _, runner := range runners {
|
for _, runner := range runners {
|
||||||
|
if runner.Accelerated && numGPU == 0 {
|
||||||
|
log.Printf("skipping accelerated runner because num_gpu=0")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if _, err := os.Stat(runner.Path); err != nil {
|
if _, err := os.Stat(runner.Path); err != nil {
|
||||||
log.Printf("llama runner not found: %v", err)
|
log.Printf("llama runner not found: %v", err)
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Add table
Reference in a new issue