Add cuda v12 variant and selection logic
Based on compute capability and driver version, pick v12 or v11 cuda variants.
This commit is contained in:
parent
fc3b4cda89
commit
4fe3a556fa
4 changed files with 84 additions and 48 deletions
43
Dockerfile
43
Dockerfile
|
@ -1,7 +1,7 @@
|
||||||
ARG GOLANG_VERSION=1.22.5
|
ARG GOLANG_VERSION=1.22.5
|
||||||
ARG CMAKE_VERSION=3.22.1
|
ARG CMAKE_VERSION=3.22.1
|
||||||
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
|
ARG CUDA_VERSION_11=11.3.1
|
||||||
ARG CUDA_VERSION=11.3.1
|
ARG CUDA_VERSION_12=12.4.0
|
||||||
ARG ROCM_VERSION=6.1.2
|
ARG ROCM_VERSION=6.1.2
|
||||||
ARG JETPACK_6=r36.2.0
|
ARG JETPACK_6=r36.2.0
|
||||||
ARG JETPACK_5=r35.4.1
|
ARG JETPACK_5=r35.4.1
|
||||||
|
@ -13,7 +13,7 @@ COPY .git .git
|
||||||
COPY .gitmodules .gitmodules
|
COPY .gitmodules .gitmodules
|
||||||
COPY llm llm
|
COPY llm llm
|
||||||
|
|
||||||
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
|
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
|
||||||
ARG CMAKE_VERSION
|
ARG CMAKE_VERSION
|
||||||
COPY ./scripts/rh_linux_deps.sh /
|
COPY ./scripts/rh_linux_deps.sh /
|
||||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||||
|
@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
|
||||||
ARG CGO_CFLAGS
|
ARG CGO_CFLAGS
|
||||||
ENV GOARCH amd64
|
ENV GOARCH amd64
|
||||||
RUN --mount=type=cache,target=/root/.ccache \
|
RUN --mount=type=cache,target=/root/.ccache \
|
||||||
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
|
OLLAMA_SKIP_STATIC_GENERATE=1 \
|
||||||
|
OLLAMA_SKIP_CPU_GENERATE=1 \
|
||||||
|
CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \
|
||||||
|
CUDA_VARIANT="_v11" \
|
||||||
|
bash gen_linux.sh
|
||||||
|
|
||||||
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
|
FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
|
||||||
|
ARG CMAKE_VERSION
|
||||||
|
COPY ./scripts/rh_linux_deps.sh /
|
||||||
|
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||||
|
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
|
||||||
|
COPY --from=llm-code / /go/src/github.com/ollama/ollama/
|
||||||
|
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
|
||||||
|
ARG CGO_CFLAGS
|
||||||
|
ENV GOARCH amd64
|
||||||
|
RUN --mount=type=cache,target=/root/.ccache \
|
||||||
|
OLLAMA_SKIP_STATIC_GENERATE=1 \
|
||||||
|
OLLAMA_SKIP_CPU_GENERATE=1 \
|
||||||
|
CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \
|
||||||
|
CUDA_VARIANT="_v12" \
|
||||||
|
bash gen_linux.sh
|
||||||
|
|
||||||
|
FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
|
||||||
ARG CMAKE_VERSION
|
ARG CMAKE_VERSION
|
||||||
COPY ./scripts/rh_linux_deps.sh /
|
COPY ./scripts/rh_linux_deps.sh /
|
||||||
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
|
||||||
|
@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
|
||||||
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
|
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
|
||||||
ARG CGO_CFLAGS
|
ARG CGO_CFLAGS
|
||||||
ENV GOARCH arm64
|
ENV GOARCH arm64
|
||||||
RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
|
RUN --mount=type=cache,target=/root/.ccache \
|
||||||
|
OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
|
||||||
|
|
||||||
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
|
FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
|
||||||
ARG CMAKE_VERSION
|
ARG CMAKE_VERSION
|
||||||
|
@ -139,8 +160,10 @@ COPY . .
|
||||||
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
|
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
|
||||||
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
|
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
|
||||||
|
COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
|
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
|
||||||
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
ARG GOFLAGS
|
ARG GOFLAGS
|
||||||
|
@ -155,8 +178,8 @@ ARG GOLANG_VERSION
|
||||||
WORKDIR /go/src/github.com/ollama/ollama
|
WORKDIR /go/src/github.com/ollama/ollama
|
||||||
COPY . .
|
COPY . .
|
||||||
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
|
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
|
||||||
COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
## arm binary += 381M
|
## arm binary += 381M
|
||||||
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
|
||||||
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
|
COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
|
||||||
|
|
|
@ -4,9 +4,17 @@ package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||||
|
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||||
|
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||||
|
|
||||||
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||||
ids := []string{}
|
ids := []string{}
|
||||||
for _, info := range gpuInfo {
|
for _, info := range gpuInfo {
|
||||||
|
@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||||
}
|
}
|
||||||
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func cudaGetVariant(gpuInfo CudaGPUInfo) string {
|
||||||
|
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
|
||||||
|
if CudaTegra != "" {
|
||||||
|
ver := strings.Split(CudaTegra, ".")
|
||||||
|
if len(ver) > 0 {
|
||||||
|
return "jetpack" + ver[0]
|
||||||
|
}
|
||||||
|
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
|
||||||
|
r := regexp.MustCompile(` R(\d+) `)
|
||||||
|
m := r.FindSubmatch(data)
|
||||||
|
if len(m) != 2 {
|
||||||
|
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
|
||||||
|
} else {
|
||||||
|
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
|
||||||
|
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
|
||||||
|
// https://developer.nvidia.com/embedded/jetpack-archive
|
||||||
|
switch l4t {
|
||||||
|
case 35:
|
||||||
|
return "jetpack5"
|
||||||
|
case 36:
|
||||||
|
return "jetpack6"
|
||||||
|
default:
|
||||||
|
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
|
||||||
|
return "v11"
|
||||||
|
}
|
||||||
|
return "v12"
|
||||||
|
}
|
||||||
|
|
40
gpu/gpu.go
40
gpu/gpu.go
|
@ -15,9 +15,7 @@ import (
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
@ -66,10 +64,6 @@ var RocmComputeMin = 9
|
||||||
// TODO find a better way to detect iGPU instead of minimum memory
|
// TODO find a better way to detect iGPU instead of minimum memory
|
||||||
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
||||||
|
|
||||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
|
||||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
|
||||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
|
||||||
|
|
||||||
// Note: gpuMutex must already be held
|
// Note: gpuMutex must already be held
|
||||||
func initCudaHandles() *cudaHandles {
|
func initCudaHandles() *cudaHandles {
|
||||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||||
|
@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList {
|
||||||
|
|
||||||
depPath := GetDepDir()
|
depPath := GetDepDir()
|
||||||
|
|
||||||
var cudaVariant string
|
|
||||||
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
|
|
||||||
if CudaTegra != "" {
|
|
||||||
ver := strings.Split(CudaTegra, ".")
|
|
||||||
if len(ver) > 0 {
|
|
||||||
cudaVariant = "jetpack" + ver[0]
|
|
||||||
}
|
|
||||||
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
|
|
||||||
r := regexp.MustCompile(` R(\d+) `)
|
|
||||||
m := r.FindSubmatch(data)
|
|
||||||
if len(m) != 2 {
|
|
||||||
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
|
|
||||||
} else {
|
|
||||||
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
|
|
||||||
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
|
|
||||||
// https://developer.nvidia.com/embedded/jetpack-archive
|
|
||||||
switch l4t {
|
|
||||||
case 35:
|
|
||||||
cudaVariant = "jetpack5"
|
|
||||||
case 36:
|
|
||||||
cudaVariant = "jetpack6"
|
|
||||||
default:
|
|
||||||
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Load ALL libraries
|
// Load ALL libraries
|
||||||
cHandles = initCudaHandles()
|
cHandles = initCudaHandles()
|
||||||
|
|
||||||
|
@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList {
|
||||||
gpuInfo := CudaGPUInfo{
|
gpuInfo := CudaGPUInfo{
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
Library: "cuda",
|
Library: "cuda",
|
||||||
Variant: cudaVariant,
|
|
||||||
},
|
},
|
||||||
index: i,
|
index: i,
|
||||||
}
|
}
|
||||||
|
@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList {
|
||||||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
gpuInfo.FreeMemory = uint64(memInfo.free)
|
||||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
||||||
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
|
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
|
||||||
|
gpuInfo.computeMajor = int(memInfo.major)
|
||||||
|
gpuInfo.computeMinor = int(memInfo.minor)
|
||||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
gpuInfo.MinimumMemory = cudaMinimumMemory
|
||||||
|
cudaVariant := cudaGetVariant(gpuInfo)
|
||||||
if depPath != "" {
|
if depPath != "" {
|
||||||
gpuInfo.DependencyPath = depPath
|
gpuInfo.DependencyPath = depPath
|
||||||
// Check for variant specific directory
|
// Check for variant specific directory
|
||||||
|
@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList {
|
||||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
||||||
gpuInfo.DriverMajor = driverMajor
|
gpuInfo.DriverMajor = driverMajor
|
||||||
gpuInfo.DriverMinor = driverMinor
|
gpuInfo.DriverMinor = driverMinor
|
||||||
|
gpuInfo.Variant = cudaGetVariant(gpuInfo)
|
||||||
|
|
||||||
// query the management library as well so we can record any skew between the two
|
// query the management library as well so we can record any skew between the two
|
||||||
// which represents overhead on the GPU we must set aside on subsequent updates
|
// which represents overhead on the GPU we must set aside on subsequent updates
|
||||||
|
|
|
@ -53,8 +53,10 @@ type CPUInfo struct {
|
||||||
|
|
||||||
type CudaGPUInfo struct {
|
type CudaGPUInfo struct {
|
||||||
GpuInfo
|
GpuInfo
|
||||||
OSOverhead uint64 // Memory overhead between the driver library and management library
|
OSOverhead uint64 // Memory overhead between the driver library and management library
|
||||||
index int //nolint:unused,nolintlint
|
index int //nolint:unused,nolintlint
|
||||||
|
computeMajor int //nolint:unused,nolintlint
|
||||||
|
computeMinor int //nolint:unused,nolintlint
|
||||||
}
|
}
|
||||||
type CudaGPUInfoList []CudaGPUInfo
|
type CudaGPUInfoList []CudaGPUInfo
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue