From f8ef4439e9673c7df2314fafb5975aeab856c51f Mon Sep 17 00:00:00 2001 From: 65a <65a.invalid> Date: Mon, 16 Oct 2023 17:41:40 -0700 Subject: [PATCH] Use build tags to generate accelerated binaries for CUDA and ROCm on Linux. The build tags rocm or cuda must be specified to both go generate and go build. ROCm builds should have both ROCM_PATH set (and the ROCM SDK present) as well as CLBlast installed (for GGML) and CLBlast_DIR set in the environment to the CLBlast cmake directory (likely /usr/lib/cmake/CLBlast). Build tags are also used to switch VRAM detection between cuda and rocm implementations, using added "accelerator_foo.go" files which contain architecture specific functions and variables. accelerator_none is used when no tags are set, and a helper function addRunner will ignore it if it is the chosen accelerator. Fix go generate commands, thanks @deadmeu for testing. --- Dockerfile | 6 +- Dockerfile.build | 4 +- README.md | 35 +++++++++++- llm/accelerator_cuda.go | 67 ++++++++++++++++++++++ llm/accelerator_none.go | 21 +++++++ llm/accelerator_rocm.go | 85 ++++++++++++++++++++++++++++ llm/llama.cpp/generate_linux_cuda.go | 24 ++++++++ llm/llama.cpp/generate_linux_rocm.go | 25 ++++++++ 8 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 llm/accelerator_cuda.go create mode 100644 llm/accelerator_none.go create mode 100644 llm/accelerator_rocm.go create mode 100644 llm/llama.cpp/generate_linux_cuda.go create mode 100644 llm/llama.cpp/generate_linux_rocm.go diff --git a/Dockerfile b/Dockerfile index c50665b6..7c882852 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local totalBiggestCard { + totalBiggestCard = possible + bigCardName = record[0] + } + } + if totalBiggestCard == 0 { + log.Printf("found ROCm GPU but failed to parse free VRAM!") + return 0, errNoAccel + } + log.Printf("ROCm selecting device %q", bigCardName) + return totalBiggestCard, nil +} diff --git 
a/llm/llama.cpp/generate_linux_cuda.go b/llm/llama.cpp/generate_linux_cuda.go
new file mode 100644
index 00000000..86a95977
--- /dev/null
+++ b/llm/llama.cpp/generate_linux_cuda.go
@@ -0,0 +1,31 @@
+//go:build cuda
+
+package llm
+
+// These directives are only processed when the cuda build tag is passed to
+// go generate (go generate -tags cuda ./...); pass the same tag to go build.
+
+//go:generate git submodule init
+
+// Build the ggml runner. The forced submodule update resets the checkout so
+// the patches below are applied to a clean tree.
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
+
+//go:generate rm -rf ggml/build/cuda
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
+
+// Build the gguf runner. The checkout is reset first, as for ggml, so that
+// re-running go generate does not fail on patches that were already applied.
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+
+//go:generate rm -rf gguf/build/cuda
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake --build gguf/build/cuda --target server --config Release
+//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
diff --git a/llm/llama.cpp/generate_linux_rocm.go b/llm/llama.cpp/generate_linux_rocm.go
new file mode 100644
index 00000000..1766be84
--- /dev/null
+++ b/llm/llama.cpp/generate_linux_rocm.go
@@ -0,0 +1,34 @@
+//go:build rocm
+
+package llm
+
+// These directives are only processed when the rocm build tag is passed to
+// go generate (go generate -tags rocm ./...); pass the same tag to go build.
+// ROCM_PATH must be set in the environment; go generate expands it in the
+// compiler paths below.
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+
+// The ggml runner is built with the CLBlast backend.
+//go:generate rm -rf ggml/build/rocm
+//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/rocm --target server --config Release
+//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner
+
+// The gguf runner is built with hipBLAS using ROCm's clang. go generate does
+// not run a shell and only unquotes double-quoted words, so each target list
+// is passed as one double-quoted argument (single quotes would reach cmake).
+//go:generate rm -rf gguf/build/rocm
+//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" "-DGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+//go:generate cmake --build gguf/build/rocm --target server --config Release
+//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner