Make CPU builds parallel and customizable AMD GPUs

The linux build now support parallel CPU builds to speed things up. This also exposes AMD GPU targets as an optional setting for advaced users who want to alter our default set.
2024-01-21 12:57:13 -08:00 · 2024-01-21 12:57:13 -08:00 · df54c723ae
commit df54c723ae
parent fa8c990e58
4 changed files with 68 additions and 44 deletions
--- a/41
+++ b/41
@ -10,91 +10,102 @@ COPY llm llm
 FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV LIBRARY_PATH /opt/amdgpu/lib64
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV LIBRARY_PATH /opt/amdgpu/lib64
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
+FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN sh gen_linux.sh
+
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
 FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/
 WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate
-RUN sh gen_linux.sh
+# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 ARG GOFLAGS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build .
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 ARG GOFLAGS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build .
 # Runtime stages
--- a/docs/development.md
+++ b/docs/development.md
@ -74,7 +74,8 @@ Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `ROCM_PATH` to the location of the ROCm
 install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`).
+CLBlast install (typically `/usr/lib/cmake/CLBlast`).  You can also customize
 the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
 ```
 go generate ./...
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -16,6 +16,10 @@ set -o pipefail
 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    GPU_LIST=(
        "gfx803"
        "gfx900"
@ -73,36 +77,42 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
-        #
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
-        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
+            #
-        #
+            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            #
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        echo "Building LCD CPU"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
-        build
+            echo "Building LCD CPU"
-        compress_libs
+            build
            compress_libs
        fi
-        #
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
-        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+            #
-        # Approximately 400% faster than LCD on same CPU
+            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-        #
+            # Approximately 400% faster than LCD on same CPU
-        init_vars
+            #
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            init_vars
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        echo "Building AVX CPU"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-        build
+            echo "Building AVX CPU"
-        compress_libs
+            build
            compress_libs
        fi
-        #
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-        # ~2013 CPU Dynamic library
+            #
-        # Approximately 10% faster than AVX on same CPU
+            # ~2013 CPU Dynamic library
-        #
+            # Approximately 10% faster than AVX on same CPU
-        init_vars
+            #
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            init_vars
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-        echo "Building AVX2 CPU"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-        build
+            echo "Building AVX2 CPU"
-        compress_libs
+            build
            compress_libs
        fi
    fi
 else
    echo "Skipping CPU generation step as requested"
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@ -6,6 +6,7 @@ export VERSION=${VERSION:-0.0.0}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
 mkdir -p dist
 for TARGETARCH in ${BUILD_ARCH}; do
@ -14,6 +15,7 @@ for TARGETARCH in ${BUILD_ARCH}; do
        --build-arg=GOFLAGS \
        --build-arg=CGO_CFLAGS \
        --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
        --build-arg=AMDGPU_TARGETS \
        --target build-$TARGETARCH \
        -f Dockerfile \
        -t builder:$TARGETARCH \