diff --git a/Dockerfile b/Dockerfile
index c50665b6..7c882852 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local
totalBiggestCard {
+			totalBiggestCard = possible
+			bigCardName = record[0]
+		}
+	}
+	if totalBiggestCard == 0 {
+		log.Printf("found ROCm GPU but failed to parse free VRAM!")
+		return 0, errNoAccel
+	}
+	log.Printf("ROCm selecting device %q", bigCardName)
+	return totalBiggestCard, nil
+}
diff --git a/llm/llama.cpp/generate_linux_cuda.go b/llm/llama.cpp/generate_linux_cuda.go
new file mode 100644
index 00000000..86a95977
--- /dev/null
+++ b/llm/llama.cpp/generate_linux_cuda.go
@@ -0,0 +1,24 @@
+//go:build cuda
+
+package llm
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
+
+//go:generate rm -rf ggml/build/cuda
+//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda --target server --config Release
+//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
+
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+
+//go:generate rm -rf gguf/build/cuda
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
+//go:generate cmake --build gguf/build/cuda --target server --config Release
+//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
diff --git a/llm/llama.cpp/generate_linux_rocm.go b/llm/llama.cpp/generate_linux_rocm.go
new file mode 100644
index 00000000..1766be84
--- /dev/null
+++ b/llm/llama.cpp/generate_linux_rocm.go
@@ -0,0 +1,25 @@
+//go:build rocm
+
+package llm
+
+//go:generate git submodule init
+
+//go:generate git submodule update --force ggml
+//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
+//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
+
+//go:generate git submodule update --force gguf
+//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+
+//go:generate rm -rf ggml/build/rocm
+//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/rocm --target server --config Release
+//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner
+
+//go:generate rm -rf gguf/build/rocm
+//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'
+//go:generate cmake --build gguf/build/rocm --target server --config Release
+//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner
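The first hunk only catches the tail of the ROCm free-VRAM selection; the loop header and the rocm-smi parsing that feed record and possible are cut off above. Below is a minimal sketch of how that selection loop might fit together. The helper name pickBiggestROCmCard, the CSV layout (device name, free VRAM in bytes), and the int64 return type are assumptions; only the selection logic, the log messages, and errNoAccel come from the visible fragment.

// Hypothetical sketch, not the PR's actual code: reconstructs the shape of
// the truncated ROCm free-VRAM selection shown in the hunk above.
package llm

import (
	"encoding/csv"
	"errors"
	"log"
	"strconv"
	"strings"
)

// errNoAccel is referenced by the visible fragment; its definition here is assumed.
var errNoAccel = errors.New("no accelerator detected")

// pickBiggestROCmCard picks the device with the most free VRAM from CSV
// records assumed to be shaped like: device-name, free-VRAM-in-bytes.
func pickBiggestROCmCard(csvOut string) (int64, error) {
	records, err := csv.NewReader(strings.NewReader(csvOut)).ReadAll()
	if err != nil {
		return 0, err
	}
	var totalBiggestCard int64
	var bigCardName string
	for _, record := range records {
		if len(record) < 2 {
			continue
		}
		// Skip rows whose VRAM column doesn't parse as an integer.
		possible, err := strconv.ParseInt(strings.TrimSpace(record[1]), 10, 64)
		if err != nil {
			continue
		}
		if possible > totalBiggestCard {
			totalBiggestCard = possible
			bigCardName = record[0]
		}
	}
	if totalBiggestCard == 0 {
		log.Printf("found ROCm GPU but failed to parse free VRAM!")
		return 0, errNoAccel
	}
	log.Printf("ROCm selecting device %q", bigCardName)
	return totalBiggestCard, nil
}

With the two new generate files in place, the CUDA and ROCm runners would presumably be produced by running go generate with the matching build tag (for example go generate -tags rocm ./... or -tags cuda ./...) before go build; the tag names come directly from the //go:build lines in the diff.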