Add cuda v12 variant and selection logic

Based on compute capability and driver version, pick v12 or v11 cuda variants.
2024-06-13 20:46:14 -07:00 · 2024-06-13 20:46:14 -07:00 · 4fe3a556fa
commit 4fe3a556fa
parent fc3b4cda89
4 changed files with 84 additions and 48 deletions
--- a/43
+++ b/43
@ -1,7 +1,7 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
+ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION_12=12.4.0
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@ -13,7 +13,7 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \
    CUDA_VARIANT="_v11" \
    bash gen_linux.sh
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \
    CUDA_VARIANT="_v12" \
    bash gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64 
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
 ARG CMAKE_VERSION
@ -139,8 +160,10 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
@ -155,8 +178,8 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ## arm binary += 381M 
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@ -4,9 +4,17 @@ package gpu
 import (
 	"log/slog"
 	"os"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
 )
 // CudaTegra captures the JETSON_JETPACK environment variable once, at package
 // initialization. Jetson devices have it factory set to the installed Jetpack
 // version in "x.y.z" form; it is empty when the variable is unset or empty.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra = os.Getenv("JETSON_JETPACK")
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
 // nvTegraReleaseRegex captures the number following " R" in the contents of
 // /etc/nv_tegra_release (the L4T release). It is compiled once at package scope
 // so cudaGetVariant does not recompile the pattern on every call.
 var nvTegraReleaseRegex = regexp.MustCompile(` R(\d+) `)
 // cudaGetVariant returns the name of the CUDA variant to use for gpuInfo.
 //
 // On linux/arm64 a Jetpack variant is tried first:
 //   - when CudaTegra (JETSON_JETPACK) is non-empty, the result is "jetpack"
 //     followed by the text before the first "." of that value;
 //   - otherwise, when /etc/nv_tegra_release can be read, its L4T release number
 //     is mapped to "jetpack5" (35) or "jetpack6" (36).
 //
 // When no Jetpack variant is determined, the result is "v11" if the compute
 // capability major is below 6 or the driver major version is below 12, and
 // "v12" otherwise. Because DriverMajor takes part in the comparison, a
 // gpuInfo whose DriverMajor is still zero always yields "v11".
 func cudaGetVariant(gpuInfo CudaGPUInfo) string {
 	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
 		if CudaTegra != "" {
 			// Only the leading component of "x.y.z" selects the variant.
 			major, _, _ := strings.Cut(CudaTegra, ".")
 			return "jetpack" + major
 		}
 		if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
 			if variant := cudaJetpackVariantFromRelease(data); variant != "" {
 				return variant
 			}
 		}
 	}
 	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
 		return "v11"
 	}
 	return "v12"
 }
 // cudaJetpackVariantFromRelease maps the contents of /etc/nv_tegra_release to a
 // Jetpack variant name, or returns "" when the release cannot be mapped.
 func cudaJetpackVariantFromRelease(data []byte) string {
 	m := nvTegraReleaseRegex.FindSubmatch(data)
 	if len(m) != 2 {
 		slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
 		return ""
 	}
 	l4t, err := strconv.Atoi(string(m[1]))
 	if err != nil {
 		// The pattern only admits digits, so this can only be an out-of-range value;
 		// fall through to the generic variant selection as before.
 		return ""
 	}
 	// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
 	// https://developer.nvidia.com/embedded/jetpack-archive
 	switch l4t {
 	case 35:
 		return "jetpack5"
 	case 36:
 		return "jetpack6"
 	}
 	slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
 	return ""
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -15,9 +15,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@ -66,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 // IGPUMemLimit is the memory size below which a device is presumed to be an integrated GPU.
 const IGPUMemLimit = 1 * format.GibiByte // iGPUs typically report about 512M (this note previously said "512G", presumably a typo — confirm), so anything less than 1G must be iGPU
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList {
 		depPath := GetDepDir()
 		var cudaVariant string
 		if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
 			if CudaTegra != "" {
 				ver := strings.Split(CudaTegra, ".")
 				if len(ver) > 0 {
 					cudaVariant = "jetpack" + ver[0]
 				}
 			} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
 				r := regexp.MustCompile(` R(\d+) `)
 				m := r.FindSubmatch(data)
 				if len(m) != 2 {
 					slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
 				} else {
 					if l4t, err := strconv.Atoi(string(m[1])); err == nil {
 						// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
 						// https://developer.nvidia.com/embedded/jetpack-archive
 						switch l4t {
 						case 35:
 							cudaVariant = "jetpack5"
 						case 36:
 							cudaVariant = "jetpack6"
 						default:
 							slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
 						}
 					}
 				}
 			}
 		}
 		// Load ALL libraries
 		cHandles = initCudaHandles()
@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo := CudaGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "cuda",
 						Variant: cudaVariant,
 					},
 					index: i,
 				}
@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
 				gpuInfo.computeMajor = int(memInfo.major)
 				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				cudaVariant := cudaGetVariant(gpuInfo)
 				if depPath != "" {
 					gpuInfo.DependencyPath = depPath
 					// Check for variant specific directory
@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				gpuInfo.Variant = cudaGetVariant(gpuInfo)
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
--- a/gpu/types.go
+++ b/gpu/types.go
@ -53,8 +53,10 @@ type CPUInfo struct {
 // CudaGPUInfo embeds GpuInfo and adds bookkeeping specific to CUDA devices.
 type CudaGPUInfo struct {
 	GpuInfo
-	OSOverhead uint64 // Memory overhead between the driver library and management library
+	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index      int    //nolint:unused,nolintlint
+	index        int    //nolint:unused,nolintlint
 	// computeMajor and computeMinor are the two parts of the device's compute
 	// capability ("major.minor"); cudaGetVariant compares computeMajor against 6.
 	computeMajor int    //nolint:unused,nolintlint
 	computeMinor int    //nolint:unused,nolintlint
 }
 // CudaGPUInfoList is a slice of CudaGPUInfo values.
 type CudaGPUInfoList []CudaGPUInfo