Merge pull request #2007 from dhiltgen/cpu_fallback

Add multiple CPU variants for Intel Mac
2024-01-18 11:32:29 -08:00 · 2024-01-18 11:32:29 -08:00 · df40b11d03
commit df40b11d03
parent d5a7353357 b992bf65fc
18 changed files with 321 additions and 186 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -79,13 +79,16 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-        arch: [amd64, arm64]
+        arch: [amd64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-latest
            arch: arm64
    runs-on: ${{ matrix.os }}
+    env:
+      GOARCH: ${{ matrix.arch }}
+      CGO_ENABLED: "1"
    steps:
      - uses: actions/checkout@v4
        with:
--- a/Dockerfile.build
+++ b/Dockerfile.build
@ -10,6 +10,7 @@ COPY llm llm

 FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
 ARG CMAKE_VERSION
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -19,6 +20,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
@ -28,6 +30,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
 ARG CMAKE_VERSION
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -38,6 +41,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh

 FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
 ARG CMAKE_VERSION
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -50,6 +54,7 @@ FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -61,6 +66,7 @@ FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
+ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -72,7 +78,7 @@ RUN sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 ARG GOFLAGS
-ARG CGO_FLAGS
+ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
@ -84,7 +90,7 @@ FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 ARG GOFLAGS
-ARG CGO_FLAGS
+ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@ -5,7 +5,7 @@

 #ifdef __linux__
 #include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
 #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
 #define LOAD_ERR() strdup(dlerror())
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
@ -58,8 +58,8 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
      {"", NULL},
  };

-  printf("loading %s library\n", libPath);
-  s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
+  printf("loading library %s\n", libPath);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW);
  if (!s->handle) {
    err->id = -1;
    char *msg = LOAD_ERR();
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@ -372,15 +372,6 @@ func updatePath(dir string) {
 		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
 		log.Printf("Updating PATH to %s", newPath)
 		os.Setenv("PATH", newPath)
-	} else {
-		pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
-		for _, comp := range pathComponents {
-			if comp == dir {
-				return
-			}
-		}
-		newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
-		log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
-		os.Setenv("LD_LIBRARY_PATH", newPath)
 	}
+	// linux and darwin rely on rpath
 }
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -2,28 +2,24 @@

 set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
+if (WIN32)
+    add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
+else()
+    add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
+endif()
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_include_directories(${TARGET} PRIVATE ../..)
 target_include_directories(${TARGET} PRIVATE ../../..)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
-target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
-    target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS ext_server_shared LIBRARY)
-endif()
+target_link_libraries(${TARGET} PRIVATE ggml llava common )
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
+install(TARGETS ext_server LIBRARY)

 if (CUDAToolkit_FOUND)
    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
    if (WIN32)
-        target_link_libraries(ext_server_shared PRIVATE nvml)
+        target_link_libraries(${TARGET} PRIVATE nvml)
    endif()
 endif()
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -1,15 +1,44 @@
 # common logic across linux and darwin

 init_vars() {
+    case "${GOARCH}" in
+    "amd64")
+        ARCH="x86_64"
+        ;;
+    "arm64")
+        ARCH="arm64"
+        ;;
+    *)
+        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+    esac
+
    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
+    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
+    case $(uname -s) in 
+    "Darwin")
+        LIB_EXT="dylib"
+        WHOLE_ARCHIVE="-Wl,-force_load"
+        NO_WHOLE_ARCHIVE=""
+        GCC_ARCH="-arch ${ARCH}"
+        ;;
+    "Linux")
+        LIB_EXT="so"
+        WHOLE_ARCHIVE="-Wl,--whole-archive"
+        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
+
+        # Cross compiling not supported on linux - Use docker
+        GCC_ARCH=""
+        ;;
+    *)
+        ;;
+    esac
 }

 git_module_setup() {
@ -40,25 +69,29 @@ apply_patches() {
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
+    mkdir -p ${BUILD_DIR}/lib/
+    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
+        ${GCC_ARCH} \
+        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,-rpath,\$ORIGIN \
+        -lpthread -ldl -lm \
+        ${EXTRA_LIBS}
 }

-install() {
-    rm -rf ${BUILD_DIR}/lib
-    mkdir -p ${BUILD_DIR}/lib
-    cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib
-    cp ${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib
-    cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib
-    cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
-}
-
-link_server_lib() {
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-        -Wl,--whole-archive \
-        ${BUILD_DIR}/lib/libext_server.a \
-        -Wl,--no-whole-archive \
-        ${BUILD_DIR}/lib/libcommon.a \
-        ${BUILD_DIR}/lib/libllama.a \
-        -lstdc++
+# compress_libs: bzip2-compress every library under ${BUILD_DIR}/lib in parallel.
+# Reads BUILD_DIR and LIB_EXT; each matching file is replaced by <file>.bz2.
+compress_libs() {
+    echo "Compressing payloads to reduce overall binary size..."
+    pids=""
+    # Remove archives left behind by a previous run. Otherwise the glob below
+    # would pick them up (bzip2 exits non-zero for inputs that already end in
+    # .bz2) and bzip2 would also refuse to overwrite them, failing the build.
+    rm -f ${BUILD_DIR}/lib/*.${LIB_EXT}*.bz2
+    # The trailing wildcard also catches versioned names such as libfoo.so.11
+    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
+        bzip2 -v9 ${lib} &
+        pids+=" $!"
+    done
+    echo 
+    # Wait on each job individually so a failed compression surfaces its exit status
+    for pid in ${pids}; do
+        wait $pid
+    done
+    echo "Finished compression"
 }

 # Keep the local tree clean after we're done with the build
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -9,16 +9,52 @@ set -o pipefail
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
-BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
+git_module_setup
+apply_patches
+
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off"
+
 case "${GOARCH}" in
 "amd64")
-    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    ARCH="x86_64"
+    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
+
+    #
+    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
+    #
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
+    echo "Building LCD CPU"
+    build
+    compress_libs
+
+    #
+    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+    # Approximately 400% faster than LCD on same CPU
+    #
+    init_vars
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
+    echo "Building AVX CPU"
+    build
+    compress_libs
+
+    #
+    # ~2013 CPU Dynamic library
+    # Approximately 10% faster than AVX on same CPU
+    #
+    init_vars
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
+    echo "Building AVX2 CPU"
+    build
+    compress_libs
    ;;
 "arm64")
-    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
-    ARCH="arm64"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
+    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
+    build
+    compress_libs
    ;;
 *)
    echo "GOARCH must be set"
@ -27,21 +63,4 @@ case "${GOARCH}" in
    ;;
 esac

-git_module_setup
-apply_patches
-build
-install
-gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-    -arch ${ARCH} \
-    -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \
-    ${BUILD_DIR}/lib/libcommon.a \
-    ${BUILD_DIR}/lib/libllama.a \
-    ${BUILD_DIR}/lib/libggml_static.a \
-    -lpthread -ldl -lm -lc++ \
-    -framework Accelerate \
-    -framework Foundation \
-    -framework Metal \
-    -framework MetalKit \
-    -framework MetalPerformanceShaders
-
 cleanup
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -2,16 +2,14 @@
 # This script is intended to run inside the go generate
 # working directory must be llm/generate/

-# First we build our default built-in library which will be linked into the CGO
-# binary as a normal dependency. This default build is CPU based.
+# First we build one or more CPU based LLM libraries
 #
-# Then we build a CUDA dynamic library (although statically linked with the CUDA
-# library dependencies for maximum portability)
+# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
+# library dependencies
 #
-# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  ROCm is particularly
-# important to be a dynamic lib even if it's the only GPU library detected because
-# we can't redistribute the objectfiles but must rely on dynamic libraries at
-# runtime, which could lead the server not to start if not present.
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
+# libraries are quite large, and also dynamically load data files at runtime
+# which in turn are large, so we don't attempt to carry them as payload

 set -ex
 set -o pipefail
@ -59,11 +57,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        install
-        link_server_lib
+        compress_libs
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@ -80,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
        #
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building LCD CPU"
        build
-        install
-        link_server_lib
+        compress_libs

        #
        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@ -92,11 +88,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
        echo "Building AVX CPU"
        build
-        install
-        link_server_lib
+        compress_libs

        #
        # ~2013 CPU Dynamic library
@ -104,11 +99,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
        echo "Building AVX2 CPU"
        build
-        install
-        link_server_lib
+        compress_libs
    fi
 else
    echo "Skipping CPU generation step as requested"
@ -127,22 +121,27 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
-    install
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-        -Wl,--whole-archive \
-        ${BUILD_DIR}/lib/libext_server.a \
-        ${BUILD_DIR}/lib/libcommon.a \
-        ${BUILD_DIR}/lib/libllama.a \
-        -Wl,--no-whole-archive \
-        ${CUDA_LIB_DIR}/libcudart_static.a \
-        ${CUDA_LIB_DIR}/libcublas_static.a \
-        ${CUDA_LIB_DIR}/libcublasLt_static.a \
-        ${CUDA_LIB_DIR}/libcudadevrt.a \
-        ${CUDA_LIB_DIR}/libculibos.a \
-        -lcuda \
-        -lrt -lpthread -ldl -lstdc++ -lm
+
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
+    #
+    # TODO - in the future we may shift to packaging these separately and conditionally
+    #        downloading them in the install script.
+    # List what the freshly linked server library depends on so the matching
+    # CUDA runtime libraries can be copied next to it.
+    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
+    for lib in libcudart.so libcublas.so libcublasLt.so ; do
+        # First choice: the exact file name reported by ldd for this library
+        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
+        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
+        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
+            # Second choice: the major-versioned name, e.g. libcudart.so.<major>
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
+        else
+            # Last resort: copy everything matching the base name. The wildcard
+            # must sit outside the quotes, otherwise the shell hands cp a
+            # literal '*' and nothing is found.
+            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/lib/"
+        fi
+    done
+    compress_libs
+
 fi

 if [ -z "${ROCM_PATH}" ]; then
@ -164,19 +163,13 @@ if [ -d "${ROCM_PATH}" ]; then
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
-    install
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-        -Wl,--whole-archive \
-        ${BUILD_DIR}/lib/libext_server.a \
-        ${BUILD_DIR}/lib/libcommon.a \
-        ${BUILD_DIR}/lib/libllama.a \
-        -Wl,--no-whole-archive \
-        -lrt -lpthread -ldl -lstdc++ -lm \
-        -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
-        -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
-        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
+
+    # Note: the ROCM libs and runtime library files are too large to embed, so we depend on
+    #       them being present at runtime on the host
+    compress_libs
 fi

 cleanup
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -5,7 +5,8 @@ $ErrorActionPreference = "Stop"
 function init_vars {
    $script:llamacppDir = "../llama.cpp"
    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off",  "-A","x64")
-    $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
+    $script:cmakeTargets = @("ext_server")
+    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
        $script:config = "RelWithDebInfo"
@ -13,6 +14,17 @@ function init_vars {
        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
        $script:config = "Release"
    }
+    # Try to find the CUDA dir
+    if ($env:CUDA_LIB_DIR -eq $null) {
+        $d=(get-command -ea 'silentlycontinue' nvcc).path
+        if ($d -ne $null) {
+            $script:CUDA_LIB_DIR=($d| split-path -parent)
+        }
+    } else {
+        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
+    }
+    $script:BZIP2=(get-command -ea 'silentlycontinue' bzip2).path
+    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
 }

 function git_module_setup {
@ -47,11 +59,25 @@ function build {
 function install {
    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
    md "${script:buildDir}/lib" -ea 0 > $null
-    cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
+    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"

    # Display the dll dependencies in the build log
-    dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
+    if ($script:DUMPBIN -ne $null) {
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
+    }
+}
+
+function compress_libs {
+    if ($script:BZIP2 -eq $null) {
+        write-host "bzip2 not installed, not compressing files"
+        return
+    }
+    write-host "Compressing dlls..."
+    $libs = dir "${script:buildDir}/lib/*.dll"
+    foreach ($file in $libs) {
+        & "$script:BZIP2" -v9 $file
+    }
 }

 function cleanup {
@ -71,33 +97,47 @@ apply_patches
 $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")

 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
 write-host "Building LCD CPU"
 build
 install
+compress_libs

 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
 write-host "Building AVX CPU"
 build
 install
+compress_libs

 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
 write-host "Building AVX2 CPU"
 build
 install
+compress_libs

-# Then build cuda as a dynamically loaded library
-# TODO figure out how to detect cuda version
-init_vars
-$script:buildDir="${script:llamacppDir}/build/windows/cuda"
-$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
-build
-install
-
+if ($null -ne $script:CUDA_LIB_DIR) {
+    # Then build cuda as a dynamically loaded library
+    $nvcc = (get-command -ea 'silentlycontinue' nvcc)
+    if ($null -ne $nvcc) {
+        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+    }
+    if ($null -ne $script:CUDA_VERSION) {
+        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
+    }
+    init_vars
+    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+    build
+    install
+    cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
+    cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
+    cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
+    compress_libs
+}
 # TODO - actually implement ROCm support on windows
-$script:buildDir="${script:llamacppDir}/build/windows/rocm"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"

 rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
 md "${script:buildDir}/lib" -ea 0 > $null
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@ -1,9 +1,9 @@
 package llm

 import (
+	"compress/bzip2"
 	"errors"
 	"fmt"
-	"golang.org/x/exp/slices"
 	"io"
 	"io/fs"
 	"log"
@ -12,6 +12,9 @@ import (
 	"runtime"
 	"strings"

+	"golang.org/x/exp/slices"
+	"golang.org/x/sync/errgroup"
+
 	"github.com/jmorganca/ollama/gpu"
 )

@ -20,7 +23,7 @@ import (
 // Any library without a variant is the lowest common denominator
 var availableDynLibs = map[string]string{}

-const pathComponentCount = 6
+const pathComponentCount = 7

 // getDynLibs returns an ordered list of LLM libraries to try, starting with the best
 func getDynLibs(gpuInfo gpu.GpuInfo) []string {
@ -100,6 +103,7 @@ func rocmDynLibPresent() bool {
 }

 func nativeInit(workdir string) error {
+	log.Printf("Extracting dynamic libraries...")
 	if runtime.GOOS == "darwin" {
 		err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
 		if err != nil {
@ -113,7 +117,7 @@ func nativeInit(workdir string) error {
 		os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
 	}

-	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			log.Printf("%s", payloadMissing)
@ -151,45 +155,61 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 	}
 	libs := []string{}

+	// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
+	// and tracking by version so we don't reexpand the files every time
+	// Also maybe consider lazy loading only what is needed
+
+	g := new(errgroup.Group)
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
 		if len(pathComps) != pathComponentCount {
 			log.Printf("unexpected payload components: %v", pathComps)
 			continue
 		}
-		// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
-		// Include the variant in the path to avoid conflicts between multiple server libs
-		targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
-		srcFile, err := libEmbed.Open(file)
-		if err != nil {
-			return nil, fmt.Errorf("read payload %s: %v", file, err)
-		}
-		defer srcFile.Close()
-		if err := os.MkdirAll(targetDir, 0o755); err != nil {
-			return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
-		}

-		destFile := filepath.Join(targetDir, filepath.Base(file))
-		if strings.Contains(destFile, "server") {
-			libs = append(libs, destFile)
-		}
-
-		_, err = os.Stat(destFile)
-		switch {
-		case errors.Is(err, os.ErrNotExist):
-			destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+		file := file
+		// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
+		// Include the variant in the path to avoid conflicts between multiple server libs
+		targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
+		// Compressed payloads are written out without their .bz2 suffix
+		destFile := filepath.Join(targetDir, filepath.Base(strings.TrimSuffix(file, ".bz2")))
+		// Record server libraries here, on the calling goroutine: appending to the
+		// shared libs slice from inside g.Go would be an unsynchronized concurrent
+		// write and could drop entries.
+		if strings.Contains(destFile, "server") {
+			libs = append(libs, destFile)
+		}
+		g.Go(func() error {
+			srcFile, err := libEmbed.Open(file)
 			if err != nil {
-				return nil, fmt.Errorf("write payload %s: %v", file, err)
+				return fmt.Errorf("read payload %s: %v", file, err)
 			}
-			defer destFile.Close()
-			if _, err := io.Copy(destFile, srcFile); err != nil {
-				return nil, fmt.Errorf("copy payload %s: %v", file, err)
+			defer srcFile.Close()
+			if err := os.MkdirAll(targetDir, 0o755); err != nil {
+				return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 			}
-		case err != nil:
-			return nil, fmt.Errorf("stat payload %s: %v", file, err)
-		}
+			src := io.Reader(srcFile)
+			// Payloads with a .bz2 suffix are decompressed while being copied out
+			if strings.HasSuffix(file, ".bz2") {
+				src = bzip2.NewReader(src)
+			}
+
+			_, err = os.Stat(destFile)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					return fmt.Errorf("write payload %s: %v", file, err)
+				}
+				defer destFile.Close()
+				if _, err := io.Copy(destFile, src); err != nil {
+					return fmt.Errorf("copy payload %s: %v", file, err)
+				}
+			case err != nil:
+				return fmt.Errorf("stat payload %s: %v", file, err)
+			}
+			return nil
+		})
 	}
-	return libs, nil
+	return libs, g.Wait()
 }

 func extractPayloadFiles(workDir, glob string) error {
@ -207,8 +227,14 @@ func extractPayloadFiles(workDir, glob string) error {
 		if err := os.MkdirAll(workDir, 0o755); err != nil {
 			return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 		}
+		src := io.Reader(srcFile)
+		filename := file
+		if strings.HasSuffix(file, ".bz2") {
+			src = bzip2.NewReader(src)
+			filename = strings.TrimSuffix(filename, ".bz2")
+		}

-		destFile := filepath.Join(workDir, filepath.Base(file))
+		destFile := filepath.Join(workDir, filepath.Base(filename))
 		_, err = os.Stat(destFile)
 		switch {
 		case errors.Is(err, os.ErrNotExist):
@ -217,7 +243,7 @@ func extractPayloadFiles(workDir, glob string) error {
 				return fmt.Errorf("write payload %s: %v", file, err)
 			}
 			defer destFile.Close()
-			if _, err := io.Copy(destFile, srcFile); err != nil {
+			if _, err := io.Copy(destFile, src); err != nil {
 				return fmt.Errorf("copy payload %s: %v", file, err)
 			}
 		case err != nil:
--- a/llm/payload_darwin.go
+++ b/llm/payload_darwin.go
@ -1,8 +0,0 @@
-package llm
-
-import (
-	"embed"
-)
-
-//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so
-var libEmbed embed.FS
--- a/llm/payload_darwin_amd64.go
+++ b/llm/payload_darwin_amd64.go
@ -0,0 +1,8 @@
+package llm
+
+import (
+	"embed"
+)
+
+//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+var libEmbed embed.FS
--- a/llm/payload_darwin_arm64.go
+++ b/llm/payload_darwin_arm64.go
@ -0,0 +1,8 @@
+package llm
+
+import (
+	"embed"
+)
+
+//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
+var libEmbed embed.FS
--- a/llm/payload_linux.go
+++ b/llm/payload_linux.go
@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/linux/*/lib/*.so
+//go:embed llama.cpp/build/linux/*/*/lib/*.so*
 var libEmbed embed.FS
--- a/llm/payload_windows.go
+++ b/llm/payload_windows.go
@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/windows/*/lib/*.dll
+//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
 var libEmbed embed.FS
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@ -1,6 +1,6 @@
 #!/bin/sh

-set -eu
+set -e

 export VERSION=${VERSION:-0.0.0}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
@ -11,21 +11,36 @@ for TARGETARCH in arm64 amd64; do
    rm -rf llm/llama.cpp/build
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done

-lipo -create -output dist/ollama dist/ollama-darwin-*
-rm -f dist/ollama-darwin-*
-codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
+lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
+rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
+if [ -n "$APPLE_IDENTITY" ]; then
+    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
+else
+    echo "Skipping code signing - set APPLE_IDENTITY"
+fi
 chmod +x dist/ollama

-# build and sign the mac app
+# build and optionally sign the mac app
 npm install --prefix app
-npm run --prefix app make:sign
+if [ -n "$APPLE_IDENTITY" ]; then
+    npm run --prefix app make:sign
+else 
+    npm run --prefix app make
+fi
 cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip

 # sign the binary and rename it
-codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
+if [ -n "$APPLE_IDENTITY" ]; then
+    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
+else
+    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
+fi
 ditto -c -k --keepParent dist/ollama dist/temp.zip
-xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+if [ -n "$APPLE_IDENTITY" ]; then
+    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+fi
 mv dist/ollama dist/ollama-darwin
 rm -f dist/temp.zip
--- a/scripts/build_remote.py
+++ b/scripts/build_remote.py
@ -66,3 +66,7 @@ subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...
 print("Building")
 subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])

+print("Copying built result")
+subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe",  './dist/'])
+
+
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@ -28,6 +28,7 @@ fi

 if [ -n "${CMAKE_VERSION}" ]; then
    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
+    dnf install -y bzip2
 fi

 if [ -n "${GOLANG_VERSION}" ]; then