subprocess improvements (#524)

* subprocess improvements

- increase the runner start-up timeout
- when a runner fails to start, fail fast rather than timing out
- try runners in priority order rather than choosing a single runner (see the sketch below)
- embed the metal runner in a metal directory rather than gpu
- refactor logging and error messages

* Update llama.go

* Update llama.go

* simplify by using glob
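
A minimal, self-contained sketch of the runner-fallback pattern described above: candidates are tried in priority order, and any that are missing on disk or fail to start are skipped. This is not the ollama code from the diff below; the helper name `startFirstAvailable` and the candidate paths are illustrative assumptions only.

// runner_fallback_sketch.go — illustrative only, not part of this commit.
package main

import (
	"fmt"
	"log"
	"os"
	"os/exec"
)

// startFirstAvailable walks candidate executables in priority order and
// returns the first one that both exists on disk and starts successfully.
func startFirstAvailable(candidates []string, args ...string) (*exec.Cmd, error) {
	for _, path := range candidates {
		if _, err := os.Stat(path); err != nil {
			log.Printf("runner not found, trying next: %v", err)
			continue
		}
		cmd := exec.Command(path, args...)
		if err := cmd.Start(); err != nil {
			log.Printf("runner failed to start, trying next: %v", err)
			continue
		}
		return cmd, nil
	}
	return nil, fmt.Errorf("no runner could be started")
}

func main() {
	// hypothetical candidate paths, highest priority first
	cmd, err := startFirstAvailable([]string{
		"/tmp/llama/metal/bin/server",
		"/tmp/llama/cpu/bin/server",
	})
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("started %s (pid %d)", cmd.Path, cmd.Process.Pid)
}

In the actual change below, chooseRunners builds the ordered candidate list per OS and newLlama walks it the same way, skipping runners that are missing or that exit before responding.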
Bruce MacDonald authored 2023-09-18 15:16:32 -04:00, committed by GitHub
parent c345053a8b
commit 66003e1d05
4 changed files with 121 additions and 106 deletions


@@ -167,14 +167,14 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
 var (
-    ggmlInit       sync.Once
-    ggmlRunnerPath string
+    ggmlInit    sync.Once
+    ggmlRunners []ModelRunner // a slice of ModelRunners ordered by priority
 )

-func ggmlRunner() ModelRunner {
+func ggmlRunner() []ModelRunner {
     ggmlInit.Do(func() {
-        ggmlRunnerPath = chooseRunner("ggml")
+        ggmlRunners = chooseRunners("ggml")
     })
-    return ModelRunner{Path: ggmlRunnerPath}
+    return ggmlRunners
 }

 const (


@@ -371,13 +371,13 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
 var (
-    ggufInit       sync.Once
-    ggufRunnerPath string
+    ggufInit    sync.Once
+    ggufRunners []ModelRunner // a slice of ModelRunners ordered by priority
 )

-func ggufRunner() ModelRunner {
+func ggufRunner() []ModelRunner {
     ggufInit.Do(func() {
-        ggufRunnerPath = chooseRunner("gguf")
+        ggufRunners = chooseRunners("gguf")
     })
-    return ModelRunner{Path: ggufRunnerPath}
+    return ggufRunners
 }


@@ -8,9 +8,9 @@ package llm
 //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
 //go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
 //go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/gpu --target server --config Release
+//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+//go:generate cmake --build ggml/build/metal --target server --config Release
 //go:generate git submodule update --force gguf
-//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build gguf/build/gpu --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+//go:generate cmake --build gguf/build/metal --target server --config Release


@@ -29,15 +29,7 @@ import (
 //go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS

-func osPath(llamaPath string) string {
-    if runtime.GOOS == "windows" {
-        return path.Join(llamaPath, "Release")
-    }
-
-    return llamaPath
-}
-
-func cudaVersion() (int, error) {
+func cudaVersion() int {
     // first try nvcc, it gives the most accurate version if available
     cmd := exec.Command("nvcc", "--version")
     output, err := cmd.CombinedOutput()
@@ -50,7 +42,7 @@ func cudaVersion() (int, error) {
             cudaVersionParts := strings.Split(cudaVersion, ".")
             cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
             if err == nil {
-                return cudaMajorVersion, nil
+                return cudaMajorVersion
             }
         }
     }
@@ -59,104 +51,118 @@ func cudaVersion() (int, error) {
     cmd = exec.Command("nvidia-smi")
     output, err = cmd.CombinedOutput()
     if err != nil {
-        return -1, err
+        return -1
     }

     re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
     matches := re.FindStringSubmatch(string(output))
     if len(matches) < 2 {
-        return -1, errors.New("could not find CUDA version")
+        return -1
     }

     cudaVersion := matches[1]
     cudaVersionParts := strings.Split(cudaVersion, ".")
     cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
     if err != nil {
-        return -1, err
+        return -1
     }
-    return cudaMajorVersion, nil
+    return cudaMajorVersion
 }

-func chooseRunner(runnerType string) string {
-    tmpDir, err := os.MkdirTemp("", "llama-*")
-    if err != nil {
-        log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
-    }
+type ModelRunner struct {
+    Path string // path to the model runner executable
+}

-    cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
-    llamaPath := cpuPath
-    files := []string{"server"}
+func chooseRunners(runnerType string) []ModelRunner {
+    buildPath := path.Join("llama.cpp", runnerType, "build")
+    var runners []string

-    // Set OS specific llama.cpp runner paths
+    // set the runners based on the OS
+    // IMPORTANT: the order of the runners in the array is the priority order
     switch runtime.GOOS {
     case "darwin":
-        // TODO: change to check metal version
-        llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
-        files = append(files, "ggml-metal.metal")
+        runners = []string{
+            path.Join(buildPath, "metal", "bin", "server"),
+            path.Join(buildPath, "cpu", "bin", "server"),
+        }
     case "linux":
-        cudaVersion, err := cudaVersion()
-        if err != nil {
-            // fallback to CPU runner in the following the CUDA version check
-            log.Printf("failed to get CUDA version: %v", err)
-        }
-
-        switch cudaVersion {
-        case 11, 12:
-            cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
-            llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
-        default:
-            if cudaVersion != -1 {
-                // a valid version was returned but it is not supported
-                log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
-            }
-
-            llamaPath = cpuPath
-        }
+        cuda := cudaVersion()
+        if cuda == 11 {
+            // prioritize CUDA 11 runner
+            runners = []string{
+                path.Join(buildPath, "cuda-11", "bin", "server"),
+                path.Join(buildPath, "cuda-12", "bin", "server"),
+                path.Join(buildPath, "cpu", "bin", "server"),
+            }
+        } else {
+            runners = []string{
+                path.Join(buildPath, "cuda-12", "bin", "server"),
+                path.Join(buildPath, "cuda-11", "bin", "server"),
+                path.Join(buildPath, "cpu", "bin", "server"),
+            }
+        }
     case "windows":
         // TODO: select windows GPU runner here when available
-        files = []string{"server.exe"}
+        runners = []string{
+            path.Join(buildPath, "cpu", "bin", "Release", "server.exe"),
+        }
     default:
         log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+        runners = []string{
+            path.Join(buildPath, "cpu", "bin", "server"),
+        }
     }

-    // check if the runner exists, if not fallback to CPU runner
-    if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-        // fallback to CPU runner
-        llamaPath = cpuPath
-        files = []string{"server"}
-
-        if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-            log.Fatalf("llama.cpp executable not found")
-        }
-        log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
-    }
-
     // copy the files locally to run the llama.cpp server
-    for _, f := range files {
-        srcPath := path.Join(llamaPath, f)
-        destPath := filepath.Join(tmpDir, f)
-
-        srcFile, err := llamaCppEmbed.Open(srcPath)
-        if err != nil {
-            log.Fatalf("read llama.cpp %s: %v", f, err)
-        }
-        defer srcFile.Close()
-
-        destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-        if err != nil {
-            log.Fatalf("write llama.cpp %s: %v", f, err)
-        }
-        defer destFile.Close()
-
-        if _, err := io.Copy(destFile, srcFile); err != nil {
-            log.Fatalf("copy llama.cpp %s: %v", f, err)
-        }
-    }
-
-    runPath := filepath.Join(tmpDir, "server")
-    if runtime.GOOS == "windows" {
-        runPath = filepath.Join(tmpDir, "server.exe")
-    }
-
-    return runPath
+    tmpDir, err := os.MkdirTemp("", "llama-*")
+    if err != nil {
+        log.Fatalf("load llama runner: failed to create temp dir: %v", err)
+    }
+
+    runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
+    for _, r := range runners {
+        // find all the files in the runner's bin directory
+        files, err := fs.Glob(llamaCppEmbed, filepath.Join(filepath.Dir(r), "*"))
+        if err != nil {
+            // this is expected, ollama may be compiled without all runners packed in
+            log.Printf("%s runner not found: %v", r, err)
+            continue
+        }
+        runnerAvailable = true
+
+        for _, f := range files {
+            srcFile, err := llamaCppEmbed.Open(f)
+            if err != nil {
+                log.Fatalf("read llama runner %s: %v", f, err)
+            }
+            defer srcFile.Close()
+
+            // create the directory in case it does not exist
+            destPath := filepath.Join(tmpDir, filepath.Dir(f))
+            if err := os.MkdirAll(destPath, 0o755); err != nil {
+                log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
+            }
+
+            destFile, err := os.OpenFile(filepath.Join(destPath, filepath.Base(f)), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+            if err != nil {
+                log.Fatalf("write llama runner %s: %v", f, err)
+            }
+            defer destFile.Close()
+
+            if _, err := io.Copy(destFile, srcFile); err != nil {
+                log.Fatalf("copy llama runner %s: %v", f, err)
+            }
+        }
+    }
+    if !runnerAvailable {
+        log.Fatalf("%s runner not found", runnerType)
+    }
+
+    // return the runners to try in priority order
+    localRunnersByPriority := []ModelRunner{}
+    for _, r := range runners {
+        localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: path.Join(tmpDir, r)})
+    }
+
+    return localRunnersByPriority
 }

 type llamaModel struct {
@@ -217,10 +223,6 @@ type Running struct {
     Cancel context.CancelFunc
 }

-type ModelRunner struct {
-    Path string // path to the model runner executable
-}
-
 type llama struct {
     api.Options
     Running Running
@@ -292,15 +294,11 @@ func NumGPU(opts api.Options) int {
     return n
 }

-func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
+func newLlama(model string, adapters []string, runners []ModelRunner, opts api.Options) (*llama, error) {
     if _, err := os.Stat(model); err != nil {
         return nil, err
     }

-    if _, err := os.Stat(runner.Path); err != nil {
-        return nil, err
-    }
-
     if len(adapters) > 1 {
         return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
     }
@@ -342,7 +340,12 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
     }

     // start the llama.cpp server with a retry in case the port is already in use
-    for try := 0; try < 3; try++ {
+    for _, runner := range runners {
+        if _, err := os.Stat(runner.Path); err != nil {
+            log.Printf("llama runner not found: %v", err)
+            continue
+        }
+
         port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
         ctx, cancel := context.WithCancel(context.Background())
         cmd := exec.CommandContext(
@@ -356,14 +359,24 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
         llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}

-        log.Print("starting llama.cpp server")
+        log.Print("starting llama runner")
         if err := llm.Cmd.Start(); err != nil {
-            log.Printf("error starting the external llama.cpp server: %v", err)
+            log.Printf("error starting the external llama runner: %v", err)
             continue
         }

+        // monitor the command, it is blocking, so if it exits we need to capture that
+        go func() {
+            err := llm.Cmd.Wait() // this will block until the command exits
+            if err != nil {
+                log.Printf("llama runner exited with error: %v", err)
+            } else {
+                log.Printf("llama runner exited")
+            }
+        }()
+
         if err := waitForServer(llm); err != nil {
-            log.Printf("error starting llama.cpp server: %v", err)
+            log.Printf("error starting llama runner: %v", err)
             llm.Close()
             // try again
             continue
@@ -373,19 +386,24 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
         return llm, nil
     }

-    return nil, fmt.Errorf("max retry exceeded starting llama.cpp")
+    return nil, fmt.Errorf("failed to start a llama runner")
 }

 func waitForServer(llm *llama) error {
     // wait for the server to start responding
     start := time.Now()
-    expiresAt := time.Now().Add(45 * time.Second)
+    expiresAt := time.Now().Add(2 * time.Minute) // be generous with timeout, large models can take a while to load
     ticker := time.NewTicker(200 * time.Millisecond)

-    log.Print("waiting for llama.cpp server to start responding")
+    log.Print("waiting for llama runner to start responding")
     for range ticker.C {
         if time.Now().After(expiresAt) {
-            return fmt.Errorf("llama.cpp server did not start within alloted time, retrying")
+            return fmt.Errorf("llama runner did not start within alloted time, retrying")
         }
+
+        // check if the server process has terminated
+        if llm.Cmd.ProcessState != nil && llm.Cmd.ProcessState.Exited() {
+            return fmt.Errorf("llama runner process has terminated")
+        }

         if err := llm.Ping(context.Background()); err == nil {
@@ -393,15 +411,12 @@ func waitForServer(llm *llama) error {
         }
     }

-    log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
+    log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
     return nil
 }

 func (llm *llama) Close() {
     llm.Cancel()
-    if err := llm.Cmd.Wait(); err != nil {
-        log.Printf("llama.cpp server exited with error: %v", err)
-    }
 }

 func (llm *llama) SetOptions(opts api.Options) {