first pass at linux gpu support (#454)
* linux gpu support
* handle multiple gpus
* add cuda docker image (#488)

Co-authored-by: Michael Yang <mxyng@pm.me>
parent 45ac07cd02
commit f221637053

9 changed files with 158 additions and 22 deletions
Dockerfile.cuda (new file, +22)

FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

WORKDIR /go/src/github.com/jmorganca/ollama
RUN apt-get update && apt-get install -y git build-essential cmake
ADD https://dl.google.com/go/go1.21.1.linux-amd64.tar.gz /tmp/go1.21.1.tar.gz
RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.1.tar.gz

COPY . .
RUN /usr/local/go/bin/go generate ./... && /usr/local/go/bin/go build -ldflags '-linkmode external -extldflags "-static"' .

FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
ENV OLLAMA_HOST 0.0.0.0

ARG USER=ollama
ARG GROUP=ollama
RUN groupadd $GROUP && useradd -m -g $GROUP $USER

COPY --from=0 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

USER $USER:$GROUP
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
api/types.go
@@ -291,7 +291,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumKeep:  -1,
 		NumBatch: 512,
-		NumGPU:   1,
+		NumGPU:   -1, // -1 here indicates that NumGPU should be set dynamically
 		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,
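The default changes from always offloading a single layer to a sentinel value: -1 means "decide at load time", while any other value remains an explicit user override. A minimal sketch of the pattern, with a hypothetical helper name (the commit's real resolution logic is `NumGPU` in llm/llama.go, shown below):

```go
package main

import "fmt"

// resolveGPULayers is hypothetical, for illustration only: -1 defers to a
// runtime probe of the hardware; anything else is an explicit user choice.
func resolveGPULayers(requested, detected int) int {
	if requested != -1 {
		return requested // the user pinned an exact layer count
	}
	return detected // fall back to the runtime heuristic
}

func main() {
	fmt.Println(resolveGPULayers(-1, 32)) // sentinel: use the detected value, 32
	fmt.Println(resolveGPULayers(8, 32))  // explicit override: 8
}
```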
docs/development.md
@@ -6,6 +6,10 @@
 
 Install required tools:
 
+- cmake version 3.24 or higher
+- go version 1.20 or higher
+- gcc version 11.4.0 or higher
+
 ```
 brew install go cmake gcc
 ```
@@ -27,3 +31,9 @@ Now you can run `ollama`:
 ```
 ./ollama
 ```
+
+## Building on Linux with GPU support
+
+- Install cmake and nvidia-cuda-toolkit
+- run `go generate ./...`
+- run `go build .`
llm/llama.cpp/generate.go
@@ -4,12 +4,14 @@
 package llm
 
 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cpu --target server --config Release
+
+//go:generate git submodule update --force gguf
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
 //go:generate cmake --build gguf/build/cpu --target server --config Release
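The `-command` line is standard `go generate` syntax: it declares an alias that later `//go:generate` directives in the same file can invoke by name, which is what keeps the repeated `git -C ggml apply` invocations short here. A minimal sketch of the mechanism in a separate hypothetical file (alias, tool, and file names invented for illustration):

```go
package llm

// -command declares "fmt-check" as an alias for `gofmt -l`; the alias is
// visible only to subsequent //go:generate directives in this same file.
//go:generate -command fmt-check gofmt -l
//go:generate fmt-check llama.go
//go:generate fmt-check utils.go
```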
llm/llama.cpp/generate_darwin_amd64.go
@@ -1,12 +1,16 @@
 package llm
 
 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmake --fresh -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force gguf
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/cpu --target server --config Release
llm/llama.cpp/generate_darwin_arm64.go
@@ -1,12 +1,16 @@
 package llm
 
 //go:generate git submodule init
-//go:generate git submodule update --force ggml gguf
-//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
+
+//go:generate git submodule update --force ggml
+//go:generate -command git-apply git -C ggml apply
+//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
+//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
+//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build ggml/build/gpu --target server --config Release
+
+//go:generate git submodule update --force gguf
 //go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/gpu --target server --config Release
llm/llama.cpp/generate_linux.go (new file, +14)

package llm

//go:generate git submodule init

//go:generate git submodule update --force ggml
//go:generate -command git-apply git -C ggml apply
//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release

//go:generate git submodule update --force gguf
//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build gguf/build/gpu --target server --config Release
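Because the new file ends in _linux.go, the Go toolchain treats the suffix as an implicit build constraint, so `go generate ./...` runs these CUDA (`LLAMA_CUBLAS=on`) directives only on Linux; the darwin_amd64 and darwin_arm64 files above are selected the same way. A minimal sketch of the mechanism (file name and command are hypothetical):

```go
// probe_linux.go: the _linux suffix is an implicit GOOS constraint, so this
// file, and the directives in it, is only considered when targeting Linux.
package llm

//go:generate echo "this directive runs only when generating on linux"
```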
llm/llama.go
@@ -58,6 +58,12 @@ func chooseRunner(gpuPath, cpuPath string) string {
 		if llamaPath == osPath(gpuPath) {
 			files = append(files, "ggml-metal.metal")
 		}
+	case "linux":
+		// check if there is a GPU available
+		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
+			// this error was logged on start-up, so we don't need to log it again
+			llamaPath = osPath(cpuPath)
+		}
 	}
 
 	for _, f := range files {
@@ -218,6 +224,72 @@ type llama struct {
 	Running
 }
 
+var errNoGPU = errors.New("nvidia-smi command failed")
+
+// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
+func CheckVRAM() (int, error) {
+	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+	err := cmd.Run()
+	if err != nil {
+		return 0, errNoGPU
+	}
+
+	var total int
+	scanner := bufio.NewScanner(&stdout)
+	for scanner.Scan() {
+		line := scanner.Text()
+		vram, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
+		}
+
+		total += vram
+	}
+
+	return total, nil
+}
+
+func NumGPU(opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	n := 1 // default to enable metal on macOS
+	if runtime.GOOS == "linux" {
+		vram, err := CheckVRAM()
+		if err != nil {
+			if err.Error() != "nvidia-smi command failed" {
+				log.Print(err.Error())
+			}
+			// nvidia driver not installed or no nvidia GPU found
+			return 0
+		}
+
+		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
+		switch {
+		case vram < 500:
+			log.Printf("WARNING: Low VRAM detected, disabling GPU")
+			n = 0
+		case vram < 1000:
+			n = 4
+		case vram < 2000:
+			n = 8
+		case vram < 4000:
+			n = 12
+		case vram < 8000:
+			n = 16
+		case vram < 12000:
+			n = 24
+		case vram < 16000:
+			n = 32
+		default:
+			n = 48
+		}
+		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+	}
+	return n
+}
+
 func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
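A minimal, self-contained sketch of the parsing and sizing above, using hypothetical `nvidia-smi` output for a machine with two 8 GiB GPUs (one integer MiB value per line, which is what `--format=csv,noheader,nounits` produces):

```go
package main

import (
	"bufio"
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// hypothetical output of:
	//   nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits
	sample := "8192\n8192\n"

	total := 0
	scanner := bufio.NewScanner(strings.NewReader(sample))
	for scanner.Scan() {
		vram, err := strconv.Atoi(scanner.Text())
		if err != nil {
			panic(err)
		}
		total += vram // CheckVRAM sums across GPUs the same way
	}

	// 16384 MiB falls past the last `vram < 16000` case, so the heuristic
	// in NumGPU would offload the default of 48 layers.
	fmt.Println(total)
}
```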
@@ -237,7 +309,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
 		"--embedding",
 	}
 
@@ -305,7 +377,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 func waitForServer(llm *llama) error {
 	// wait for the server to start responding
 	start := time.Now()
-	expiresAt := time.Now().Add(30 * time.Second)
+	expiresAt := time.Now().Add(45 * time.Second)
 	ticker := time.NewTicker(200 * time.Millisecond)
 
 	log.Print("waiting for llama.cpp server to start responding")
server/routes.go
@@ -12,6 +12,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"reflect"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -548,6 +549,13 @@ func Serve(ln net.Listener, origins []string) error {
 		os.Exit(0)
 	}()
 
+	if runtime.GOOS == "linux" {
+		// check compatibility to log warnings
+		if _, err := llm.CheckVRAM(); err != nil {
+			log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err)
+		}
+	}
+
 	return s.Serve(ln)
 }