Merge pull request #783 from jmorganca/mxyng/fix-gpu-offloading

fix: offloading on low-end GPUs
This commit is contained in:
Michael Yang 2023-10-13 14:36:44 -07:00 committed by GitHub
commit d790bf9916
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -31,41 +31,42 @@ var llamaCppEmbed embed.FS
type ModelRunner struct { type ModelRunner struct {
Path string // path to the model runner executable Path string // path to the model runner executable
Accelerated bool
} }
func chooseRunners(workDir, runnerType string) []ModelRunner { func chooseRunners(workDir, runnerType string) []ModelRunner {
buildPath := path.Join("llama.cpp", runnerType, "build") buildPath := path.Join("llama.cpp", runnerType, "build")
var runners []string var runners []ModelRunner
// set the runners based on the OS // set the runners based on the OS
// IMPORTANT: the order of the runners in the array is the priority order // IMPORTANT: the order of the runners in the array is the priority order
switch runtime.GOOS { switch runtime.GOOS {
case "darwin": case "darwin":
runners = []string{ runners = []ModelRunner{
path.Join(buildPath, "metal", "bin", "ollama-runner"), {Path: path.Join(buildPath, "metal", "bin", "ollama-runner")},
path.Join(buildPath, "cpu", "bin", "ollama-runner"), {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
} }
case "linux": case "linux":
runners = []string{ runners = []ModelRunner{
path.Join(buildPath, "cuda", "bin", "ollama-runner"), {Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
path.Join(buildPath, "cpu", "bin", "ollama-runner"), {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
} }
case "windows": case "windows":
// TODO: select windows GPU runner here when available // TODO: select windows GPU runner here when available
runners = []string{ runners = []ModelRunner{
path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe"), {Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
} }
default: default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS) log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
runners = []string{ runners = []ModelRunner{
path.Join(buildPath, "cpu", "bin", "ollama-runner"), {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
} }
} }
runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
for _, r := range runners { for _, r := range runners {
// find all the files in the runner's bin directory // find all the files in the runner's bin directory
files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*")) files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
if err != nil { if err != nil {
// this is expected, ollama may be compiled without all runners packed in // this is expected, ollama may be compiled without all runners packed in
log.Printf("%s runner not found: %v", r, err) log.Printf("%s runner not found: %v", r, err)
@ -115,7 +116,10 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
localRunnersByPriority := []ModelRunner{} localRunnersByPriority := []ModelRunner{}
for _, r := range runners { for _, r := range runners {
// clean the ModelRunner paths so that they match the OS we are running on // clean the ModelRunner paths so that they match the OS we are running on
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))}) localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
Path: filepath.Clean(path.Join(workDir, r.Path)),
Accelerated: r.Accelerated,
})
} }
return localRunnersByPriority return localRunnersByPriority
@ -215,6 +219,11 @@ func CheckVRAM() (int64, error) {
free += vram free += vram
} }
if free*1024*1024 < 2*1000*1000*1000 {
log.Printf("less than 2 GB VRAM available, falling back to CPU only")
free = 0
}
return free, nil return free, nil
} }
@ -276,16 +285,20 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
} }
numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
params := []string{ params := []string{
"--model", model, "--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx), "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase), "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale), "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
"--embedding", "--embedding",
} }
if numGPU > 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", numGPU))
}
if opts.NumGQA > 0 { if opts.NumGQA > 0 {
params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA)) params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
} }
@ -316,6 +329,11 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
// start the llama.cpp server with a retry in case the port is already in use // start the llama.cpp server with a retry in case the port is already in use
for _, runner := range runners { for _, runner := range runners {
if runner.Accelerated && numGPU == 0 {
log.Printf("skipping accelerated runner because num_gpu=0")
continue
}
if _, err := os.Stat(runner.Path); err != nil { if _, err := os.Stat(runner.Path); err != nil {
log.Printf("llama runner not found: %v", err) log.Printf("llama runner not found: %v", err)
continue continue