first pass at linux gpu support (#454)

* linux gpu support
* handle multiple gpus
* add cuda docker image (#488)
---------

Co-authored-by: Michael Yang <mxyng@pm.me>
This commit is contained in:
Bruce MacDonald 2023-09-12 11:04:35 -04:00 committed by GitHub
parent 45ac07cd02
commit f221637053
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 158 additions and 22 deletions

22
Dockerfile.cuda Normal file
View file

@ -0,0 +1,22 @@
# Build stage: compile ollama inside the CUDA 12.2 development image.
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
WORKDIR /go/src/github.com/jmorganca/ollama
# Tools needed by `go generate` (git, a C/C++ toolchain and cmake).
RUN apt-get update && apt-get install -y git build-essential cmake
# Fetch the Go 1.21.1 toolchain tarball and unpack it under /usr/local (yielding /usr/local/go).
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz
COPY . .
# Run the go:generate steps, then build the binary with external linking and static extldflags.
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .
# Final stage: start from the CUDA runtime image and copy in only the built binary.
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
# NOTE(review): presumably makes the server listen on all interfaces inside the container - confirm.
ENV OLLAMA_HOST 0.0.0.0
# User and group the container runs as; both can be overridden at build time.
ARG USER=ollama
ARG GROUP=ollama
RUN groupadd $GROUP && useradd -m -g $GROUP $USER
COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
USER $USER:$GROUP
# Default invocation is `/bin/ollama serve`; arguments passed to `docker run` replace `serve`.
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]

View file

@ -291,7 +291,7 @@ func DefaultOptions() Options {
NumCtx: 2048, NumCtx: 2048,
NumKeep: -1, NumKeep: -1,
NumBatch: 512, NumBatch: 512,
NumGPU: 1, NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumGQA: 1, NumGQA: 1,
LowVRAM: false, LowVRAM: false,
F16KV: true, F16KV: true,

View file

@ -6,6 +6,10 @@
Install required tools: Install required tools:
- cmake version 3.24 or higher
- go version 1.20 or higher
- gcc version 11.4.0 or higher
``` ```
brew install go cmake gcc brew install go cmake gcc
``` ```
@ -27,3 +31,9 @@ Now you can run `ollama`:
``` ```
./ollama ./ollama
``` ```
## Building on Linux with GPU support
- Install cmake and nvidia-cuda-toolkit
- Run `go generate ./...`
- Run `go build .`

View file

@ -4,12 +4,14 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml gguf
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate -command git-apply git -C ggml apply
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/cpu --target server --config Release //go:generate cmake --build gguf/build/cpu --target server --config Release

View file

@ -1,12 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml gguf
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate -command git-apply git -C ggml apply
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate cmake --fresh -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/cpu --target server --config Release //go:generate cmake --build gguf/build/cpu --target server --config Release

View file

@ -1,12 +1,16 @@
package llm package llm
//go:generate git submodule init //go:generate git submodule init
//go:generate git submodule update --force ggml gguf
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch //go:generate -command git-apply git -C ggml apply
//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/gpu --target server --config Release //go:generate cmake --build ggml/build/gpu --target server --config Release
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/gpu --target server --config Release //go:generate cmake --build gguf/build/gpu --target server --config Release

View file

@ -0,0 +1,14 @@
package llm

// The directives below are executed in order by `go generate`.

// Initialize the git submodules and force-update the ggml checkout.
//go:generate git submodule init
//go:generate git submodule update --force ggml

// Define `git-apply` as an alias for `git -C ggml apply` for the remainder of
// this file, then apply the local patches to the ggml checkout.
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch

// Configure and build the `server` target from ggml into ggml/build/gpu with cuBLAS enabled.
// NOTE(review): LLAMA_ACCELERATE presumably refers to Apple's Accelerate framework
// and may have no effect in this CUDA build - confirm.
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release

// Force-update the gguf checkout and build its `server` target the same way
// (no patches are applied to gguf here).
//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/gpu --target server --config Release

View file

@ -58,6 +58,12 @@ func chooseRunner(gpuPath, cpuPath string) string {
if llamaPath == osPath(gpuPath) { if llamaPath == osPath(gpuPath) {
files = append(files, "ggml-metal.metal") files = append(files, "ggml-metal.metal")
} }
case "linux":
// check if there is a GPU available
if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
// this error was logged on start-up, so we don't need to log it again
llamaPath = osPath(cpuPath)
}
} }
for _, f := range files { for _, f := range files {
@ -218,6 +224,72 @@ type llama struct {
Running Running
} }
// errNoGPU is returned by CheckVRAM when the nvidia-smi command cannot be
// started or exits with an error.
var errNoGPU = errors.New("nvidia-smi command failed")

// CheckVRAM returns the total VRAM, summed over every GPU that nvidia-smi
// reports (one value per output line, in MiB as printed by
// `--query-gpu=memory.total --format=csv,noheader,nounits`).
//
// It returns errNoGPU when nvidia-smi cannot be run, and a wrapped error when
// the output cannot be read or a line is not an integer. The returned int is
// always 0 when the error is non-nil.
func CheckVRAM() (int, error) {
	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
	var stdout bytes.Buffer
	cmd.Stdout = &stdout
	if err := cmd.Run(); err != nil {
		// The underlying error is deliberately replaced by the sentinel so
		// callers can detect "no usable GPU" with errors.Is.
		return 0, errNoGPU
	}

	var total int
	scanner := bufio.NewScanner(&stdout)
	for scanner.Scan() {
		// Tolerate padding and blank lines so stray whitespace in the tool
		// output is not reported as a parse failure.
		line := bytes.TrimSpace(scanner.Bytes())
		if len(line) == 0 {
			continue
		}

		vram, err := strconv.Atoi(string(line))
		if err != nil {
			return 0, fmt.Errorf("failed to parse available VRAM: %w", err)
		}
		total += vram
	}
	// Scan() returning false may mean a read error rather than EOF.
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read nvidia-smi output: %w", err)
	}

	return total, nil
}
// NumGPU returns the value passed to the runner's --n-gpu-layers flag.
//
// An explicit opts.NumGPU (anything other than -1) is returned unchanged.
// When it is -1 the value is chosen dynamically: 1 on non-Linux systems, and
// on Linux a layer count derived from the VRAM reported by CheckVRAM, or 0
// when CheckVRAM fails.
func NumGPU(opts api.Options) int {
	if opts.NumGPU != -1 {
		return opts.NumGPU
	}

	if runtime.GOOS != "linux" {
		// default to enable metal on macOS
		return 1
	}

	vram, err := CheckVRAM()
	if err != nil {
		// The missing-GPU case is expected and is not logged here; any other
		// failure (e.g. unparsable nvidia-smi output) is worth logging.
		if !errors.Is(err, errNoGPU) {
			log.Print(err.Error())
		}
		// nvidia driver not installed or no nvidia GPU found
		return 0
	}

	// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
	var n int
	switch {
	case vram < 500:
		log.Printf("WARNING: Low VRAM detected, disabling GPU")
		n = 0
	case vram < 1000:
		n = 4
	case vram < 2000:
		n = 8
	case vram < 4000:
		n = 12
	case vram < 8000:
		n = 16
	case vram < 12000:
		n = 24
	case vram < 16000:
		n = 32
	default:
		n = 48
	}
	log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)

	return n
}
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) { func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
if _, err := os.Stat(model); err != nil { if _, err := os.Stat(model); err != nil {
return nil, err return nil, err
@ -237,7 +309,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase), "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale), "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU), "--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
"--embedding", "--embedding",
} }
@ -305,7 +377,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
func waitForServer(llm *llama) error { func waitForServer(llm *llama) error {
// wait for the server to start responding // wait for the server to start responding
start := time.Now() start := time.Now()
expiresAt := time.Now().Add(30 * time.Second) expiresAt := time.Now().Add(45 * time.Second)
ticker := time.NewTicker(200 * time.Millisecond) ticker := time.NewTicker(200 * time.Millisecond)
log.Print("waiting for llama.cpp server to start responding") log.Print("waiting for llama.cpp server to start responding")

View file

@ -12,6 +12,7 @@ import (
"os/signal" "os/signal"
"path/filepath" "path/filepath"
"reflect" "reflect"
"runtime"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@ -548,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
os.Exit(0) os.Exit(0)
}() }()
if runtime.GOOS == "linux" {
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
}
}
return s.Serve(ln) return s.Serve(ln)
} }