support for packaging in multiple cuda runners (#509)
* enable packaging multiple cuda versions
* use nvcc cuda version if available

Co-authored-by: Michael Yang <mxyng@pm.me>
parent 83ffb154bc
commit 2540c9181c
5 changed files with 96 additions and 38 deletions
@@ -35,5 +35,5 @@ Now you can run `ollama`:
 ## Building on Linux with GPU support
 
 - Install cmake and nvidia-cuda-toolkit
-- run `go generate ./...`
+- run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...`
 - run `go build .`
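As a concrete illustration of the documentation change above, the `sed` expression pulls the major version out of the `release X.Y` line printed by `nvcc --version`. The sample output below is representative only; the toolkit version will vary per machine:

```
$ nvcc --version | grep release
Cuda compilation tools, release 11.8, V11.8.89

$ CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p')
$ echo "$CUDA_VERSION"
11
```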
@@ -4,7 +4,6 @@ import (
     "encoding/binary"
     "errors"
     "io"
-    "path"
     "sync"
 )
 
@@ -166,11 +165,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
     return nil, nil
 }
 
-var (
-    ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
-    ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
-)
-
 var (
     ggmlInit       sync.Once
     ggmlRunnerPath string
@@ -178,7 +172,7 @@ var (
 
 func ggmlRunner() ModelRunner {
     ggmlInit.Do(func() {
-        ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
+        ggmlRunnerPath = chooseRunner("ggml")
     })
     return ModelRunner{Path: ggmlRunnerPath}
 }
@@ -6,7 +6,6 @@ import (
     "errors"
     "fmt"
     "io"
-    "path"
     "sync"
 )
 
@@ -370,11 +369,6 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
     return
 }
 
-var (
-    ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
-    ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
-)
-
 var (
     ggufInit       sync.Once
     ggufRunnerPath string
@@ -382,7 +376,7 @@ var (
 
 func ggufRunner() ModelRunner {
     ggufInit.Do(func() {
-        ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
+        ggufRunnerPath = chooseRunner("gguf")
     })
 
     return ModelRunner{Path: ggufRunnerPath}
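With the GPU/CPU path pair removed from the call sites, `chooseRunner` now derives its candidate paths from the runner type alone. A sketch of the embedded directory layout it chooses from on Linux, assuming the CUDA 11 and CUDA 12 builds are both generated and packaged (layout inferred from the build paths in this commit, not emitted by it):

```
llama.cpp/
├── ggml/build/cpu/bin/server
├── ggml/build/cuda-11/bin/server
├── ggml/build/cuda-12/bin/server
├── gguf/build/cpu/bin/server
├── gguf/build/cuda-11/bin/server
└── gguf/build/cuda-12/bin/server
```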
@@ -7,9 +7,15 @@ package llm
 //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
 //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
 //go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/gpu --target server --config Release
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cpu --target server --config Release
 
 //go:generate git submodule update --force gguf
-//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build gguf/build/gpu --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release
+
+//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release
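Taken together with the documentation change, the per-version build directories suggest running the generate step once per CUDA runner and shipping the resulting `cuda-<major>` directories alongside the CPU build. The flow below is a hedged sketch of that packaging, not a script from the commit:

```
# on a CI runner with the CUDA 11 toolkit installed
CUDA_VERSION=11 go generate ./...

# on a CI runner with the CUDA 12 toolkit installed
CUDA_VERSION=12 go generate ./...

# each run leaves llama.cpp/<runner>/build/cuda-<major>/bin/server
# next to llama.cpp/<runner>/build/cpu/bin/server, which the final
# build embeds
go build .
```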
llm/llama.go (102 changed lines)
@@ -17,6 +17,7 @@ import (
     "os/exec"
     "path"
     "path/filepath"
+    "regexp"
     "runtime"
     "strconv"
     "strings"
@@ -36,36 +37,99 @@ func osPath(llamaPath string) string {
     return llamaPath
 }
 
-func chooseRunner(gpuPath, cpuPath string) string {
+func cudaVersion() (int, error) {
+    // first try nvcc, it gives the most accurate version if available
+    cmd := exec.Command("nvcc", "--version")
+    output, err := cmd.CombinedOutput()
+    if err == nil {
+        // regex to match the CUDA version line in nvcc --version output
+        re := regexp.MustCompile(`release (\d+\.\d+),`)
+        matches := re.FindStringSubmatch(string(output))
+        if len(matches) >= 2 {
+            cudaVersion := matches[1]
+            cudaVersionParts := strings.Split(cudaVersion, ".")
+            cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+            if err == nil {
+                return cudaMajorVersion, nil
+            }
+        }
+    }
+
+    // fall back to nvidia-smi
+    cmd = exec.Command("nvidia-smi")
+    output, err = cmd.CombinedOutput()
+    if err != nil {
+        return -1, err
+    }
+
+    re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
+    matches := re.FindStringSubmatch(string(output))
+    if len(matches) < 2 {
+        return -1, errors.New("could not find CUDA version")
+    }
+
+    cudaVersion := matches[1]
+    cudaVersionParts := strings.Split(cudaVersion, ".")
+    cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+    if err != nil {
+        return -1, err
+    }
+    return cudaMajorVersion, nil
+}
+
+func chooseRunner(runnerType string) string {
     tmpDir, err := os.MkdirTemp("", "llama-*")
     if err != nil {
         log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
     }
 
-    llamaPath := osPath(gpuPath)
+    cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
+    llamaPath := cpuPath
+    files := []string{"server"}
+
+    // set OS-specific llama.cpp runner paths
+    switch runtime.GOOS {
+    case "darwin":
+        // TODO: change to check metal version
+        llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
+        files = append(files, "ggml-metal.metal")
+    case "linux":
+        cudaVersion, err := cudaVersion()
+        if err != nil {
+            // fall back to the CPU runner via the version switch below
+            log.Printf("failed to get CUDA version: %v", err)
+        }
+
+        switch cudaVersion {
+        case 11, 12:
+            cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
+            llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
+        default:
+            if cudaVersion != -1 {
+                // a valid version was returned but it is not supported
+                log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
+            }
+            llamaPath = cpuPath
+        }
+    case "windows":
+        // TODO: select windows GPU runner here when available
+        files = []string{"server.exe"}
+    default:
+        log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+    }
+
+    // check if the runner exists, if not fall back to the CPU runner
     if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-        llamaPath = osPath(cpuPath)
+        // fall back to CPU runner
+        llamaPath = cpuPath
+        files = []string{"server"}
         if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
             log.Fatalf("llama.cpp executable not found")
         }
+        log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
     }
 
-    files := []string{"server"}
-    switch runtime.GOOS {
-    case "windows":
-        files = []string{"server.exe"}
-    case "darwin":
-        if llamaPath == osPath(gpuPath) {
-            files = append(files, "ggml-metal.metal")
-        }
-    case "linux":
-        // check if there is a GPU available
-        if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
-            // this error was logged on start-up, so we don't need to log it again
-            llamaPath = osPath(cpuPath)
-        }
-    }
-
+    // copy the files locally to run the llama.cpp server
     for _, f := range files {
         srcPath := path.Join(llamaPath, f)
         destPath := filepath.Join(tmpDir, f)
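For reference, the two regular expressions in `cudaVersion` target the version strings that the NVIDIA tools print. The outputs below are representative samples (driver and toolkit versions will differ), not captured from this change:

```
$ nvcc --version | grep release
Cuda compilation tools, release 11.8, V11.8.89
# matched by `release (\d+\.\d+),`; major version 11 selects the cuda-11 build

$ nvidia-smi | grep "CUDA Version"
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
# fallback match on `CUDA Version: (\d+\.\d+)` when nvcc is not on the PATH
```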