support for packaging in multiple cuda runners (#509)

* enable packaging multiple cuda versions
* use nvcc cuda version if available

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
This commit is contained in:
Bruce MacDonald 2023-09-14 15:08:13 -04:00 committed by GitHub
parent 83ffb154bc
commit 2540c9181c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 96 additions and 38 deletions

View file

@ -35,5 +35,5 @@ Now you can run `ollama`:
## Building on Linux with GPU support ## Building on Linux with GPU support
- Install cmake and nvidia-cuda-toolkit - Install cmake and nvidia-cuda-toolkit
- run `go generate ./...` - run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...`
- run `go build .` - run `go build .`

View file

@ -4,7 +4,6 @@ import (
"encoding/binary" "encoding/binary"
"errors" "errors"
"io" "io"
"path"
"sync" "sync"
) )
@ -166,11 +165,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
return nil, nil return nil, nil
} }
var (
ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
)
var ( var (
ggmlInit sync.Once ggmlInit sync.Once
ggmlRunnerPath string ggmlRunnerPath string
@ -178,7 +172,7 @@ var (
func ggmlRunner() ModelRunner { func ggmlRunner() ModelRunner {
ggmlInit.Do(func() { ggmlInit.Do(func() {
ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU) ggmlRunnerPath = chooseRunner("ggml")
}) })
return ModelRunner{Path: ggmlRunnerPath} return ModelRunner{Path: ggmlRunnerPath}
} }

View file

@ -6,7 +6,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"path"
"sync" "sync"
) )
@ -370,11 +369,6 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
return return
} }
var (
ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
)
var ( var (
ggufInit sync.Once ggufInit sync.Once
ggufRunnerPath string ggufRunnerPath string
@ -382,7 +376,7 @@ var (
func ggufRunner() ModelRunner { func ggufRunner() ModelRunner {
ggufInit.Do(func() { ggufInit.Do(func() {
ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU) ggufRunnerPath = chooseRunner("gguf")
}) })
return ModelRunner{Path: ggufRunnerPath} return ModelRunner{Path: ggufRunnerPath}

View file

@ -7,9 +7,15 @@ package llm
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch //go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf //go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/gpu --target server --config Release //go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release
//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release

View file

@ -17,6 +17,7 @@ import (
"os/exec" "os/exec"
"path" "path"
"path/filepath" "path/filepath"
"regexp"
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
@ -36,36 +37,99 @@ func osPath(llamaPath string) string {
return llamaPath return llamaPath
} }
func chooseRunner(gpuPath, cpuPath string) string { func cudaVersion() (int, error) {
// first try nvcc, it gives the most accurate version if available
cmd := exec.Command("nvcc", "--version")
output, err := cmd.CombinedOutput()
if err == nil {
// regex to match the CUDA version line in nvcc --version output
re := regexp.MustCompile(`release (\d+\.\d+),`)
matches := re.FindStringSubmatch(string(output))
if len(matches) >= 2 {
cudaVersion := matches[1]
cudaVersionParts := strings.Split(cudaVersion, ".")
cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
if err == nil {
return cudaMajorVersion, nil
}
}
}
// fallback to nvidia-smi
cmd = exec.Command("nvidia-smi")
output, err = cmd.CombinedOutput()
if err != nil {
return -1, err
}
re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
matches := re.FindStringSubmatch(string(output))
if len(matches) < 2 {
return -1, errors.New("could not find CUDA version")
}
cudaVersion := matches[1]
cudaVersionParts := strings.Split(cudaVersion, ".")
cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
if err != nil {
return -1, err
}
return cudaMajorVersion, nil
}
func chooseRunner(runnerType string) string {
tmpDir, err := os.MkdirTemp("", "llama-*") tmpDir, err := os.MkdirTemp("", "llama-*")
if err != nil { if err != nil {
log.Fatalf("llama.cpp: failed to create temp dir: %v", err) log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
} }
llamaPath := osPath(gpuPath) cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
llamaPath := cpuPath
files := []string{"server"}
// Set OS specific llama.cpp runner paths
switch runtime.GOOS {
case "darwin":
// TODO: change to check metal version
llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
files = append(files, "ggml-metal.metal")
case "linux":
cudaVersion, err := cudaVersion()
if err != nil {
// fall back to the CPU runner in the CUDA version check that follows
log.Printf("failed to get CUDA version: %v", err)
}
switch cudaVersion {
case 11, 12:
cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
default:
if cudaVersion != -1 {
// a valid version was returned but it is not supported
log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
}
llamaPath = cpuPath
}
case "windows":
// TODO: select windows GPU runner here when available
files = []string{"server.exe"}
default:
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
}
// check if the runner exists, if not fallback to CPU runner
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil { if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
llamaPath = osPath(cpuPath) // fallback to CPU runner
llamaPath = cpuPath
files = []string{"server"}
if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil { if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
log.Fatalf("llama.cpp executable not found") log.Fatalf("llama.cpp executable not found")
} }
log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
} }
files := []string{"server"} // copy the files locally to run the llama.cpp server
switch runtime.GOOS {
case "windows":
files = []string{"server.exe"}
case "darwin":
if llamaPath == osPath(gpuPath) {
files = append(files, "ggml-metal.metal")
}
case "linux":
// check if there is a GPU available
if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
// this error was logged on start-up, so we don't need to log it again
llamaPath = osPath(cpuPath)
}
}
for _, f := range files { for _, f := range files {
srcPath := path.Join(llamaPath, f) srcPath := path.Join(llamaPath, f)
destPath := filepath.Join(tmpDir, f) destPath := filepath.Join(tmpDir, f)