Merge pull request #2007 from dhiltgen/cpu_fallback

Add multiple CPU variants for Intel Mac
2024-01-18 11:32:29 -08:00 · 2024-01-18 11:32:29 -08:00 · df40b11d03
commit df40b11d03
parent d5a7353357 b992bf65fc
18 changed files with 321 additions and 186 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -79,13 +79,16 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-        arch: [amd64, arm64]
+        arch: [amd64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-latest
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: "1"
    steps:
      - uses: actions/checkout@v4
        with:
--- a/Dockerfile.build
+++ b/Dockerfile.build
@ -10,6 +10,7 @@ COPY llm llm
 FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -19,6 +20,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
@ -28,6 +30,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -38,6 +41,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64
 ARG CMAKE_VERSION
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -50,6 +54,7 @@ FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -61,6 +66,7 @@ FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
@ -72,7 +78,7 @@ RUN sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 ARG GOFLAGS
-ARG CGO_FLAGS
+ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
@ -84,7 +90,7 @@ FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 ARG GOFLAGS
-ARG CGO_FLAGS
+ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/jmorganca/ollama
 COPY . .
 COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
--- a/llm/dyn_ext_server.c
+++ b/llm/dyn_ext_server.c
@ -5,7 +5,7 @@
 #ifdef __linux__
 #include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
 #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
 #define LOAD_ERR() strdup(dlerror())
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
@ -58,8 +58,8 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s,
      {"", NULL},
  };
-  printf("loading %s library\n", libPath);
+  printf("loading library %s\n", libPath);
-  s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW);
  if (!s->handle) {
    err->id = -1;
    char *msg = LOAD_ERR();
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@ -372,15 +372,6 @@ func updatePath(dir string) {
 		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
 		log.Printf("Updating PATH to %s", newPath)
 		os.Setenv("PATH", newPath)
 	} else {
 		pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
 		for _, comp := range pathComponents {
 			if comp == dir {
 				return
 			}
 		}
 		newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
 		log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
 		os.Setenv("LD_LIBRARY_PATH", newPath)
 	}
 	// linux and darwin rely on rpath
 }
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -2,28 +2,24 @@
 set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
+if (WIN32)
    add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp)
 else()
    add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp)
 endif()
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_include_directories(${TARGET} PRIVATE ../..)
 target_include_directories(${TARGET} PRIVATE ../../..)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
-target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llava common )
-target_compile_definitions(${TARGET} PRIVATE
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
-)
+install(TARGETS ext_server LIBRARY)
 if (BUILD_SHARED_LIBS)
    set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
    target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS ext_server_shared LIBRARY)
 endif()
 if (CUDAToolkit_FOUND)
    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
    if (WIN32)
-        target_link_libraries(ext_server_shared PRIVATE nvml)
+        target_link_libraries(${TARGET} PRIVATE nvml)
    endif()
 endif()
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -1,15 +1,44 @@
 # common logic across linux and darwin
 init_vars() {
    case "${GOARCH}" in
    "amd64")
        ARCH="x86_64"
        ;;
    "arm64")
        ARCH="arm64"
        ;;
    *)
        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
    esac
    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS=""
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
+    CMAKE_TARGETS="--target ext_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on"
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off"
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
    case $(uname -s) in 
    "Darwin")
        LIB_EXT="dylib"
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        ;;
    "Linux")
        LIB_EXT="so"
        WHOLE_ARCHIVE="-Wl,--whole-archive"
        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        ;;
    *)
        ;;
    esac
 }
 git_module_setup() {
@ -40,25 +69,29 @@ apply_patches() {
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    mkdir -p ${BUILD_DIR}/lib/
    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
        ${GCC_ARCH} \
        ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \
        ${BUILD_DIR}/common/libcommon.a \
        ${BUILD_DIR}/libllama.a \
        -Wl,-rpath,\$ORIGIN \
        -lpthread -ldl -lm \
        ${EXTRA_LIBS}
 }
-install() {
+compress_libs() {
-    rm -rf ${BUILD_DIR}/lib
+    echo "Compressing payloads to reduce overall binary size..."
-    mkdir -p ${BUILD_DIR}/lib
+    pids=""
-    cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib
+    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
-    cp ${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib
+        bzip2 -v9 ${lib} &
-    cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib
+        pids+=" $!"
-    cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
+    done
-}
+    echo 
-
+    for pid in ${pids}; do
-link_server_lib() {
+        wait $pid
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
+    done
-        -Wl,--whole-archive \
+    echo "Finished compression"
        ${BUILD_DIR}/lib/libext_server.a \
        -Wl,--no-whole-archive \
        ${BUILD_DIR}/lib/libcommon.a \
        ${BUILD_DIR}/lib/libllama.a \
        -lstdc++
 }
 # Keep the local tree clean after we're done with the build
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -9,16 +9,52 @@ set -o pipefail
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
+git_module_setup
-BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
+apply_patches
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off"
 case "${GOARCH}" in
 "amd64")
-    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
-    ARCH="x86_64"
+
    #
    # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
    #
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
    echo "Building LCD CPU"
    build
    compress_libs
    #
    # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
    # Approximately 400% faster than LCD on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
    echo "Building AVX CPU"
    build
    compress_libs
    #
    # ~2013 CPU Dynamic library
    # Approximately 10% faster than AVX on same CPU
    #
    init_vars
    CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
    echo "Building AVX2 CPU"
    build
    compress_libs
    ;;
 "arm64")
-    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
-    ARCH="arm64"
+    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
    EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
    build
    compress_libs
    ;;
 *)
    echo "GOARCH must be set"
@ -27,21 +63,4 @@ case "${GOARCH}" in
    ;;
 esac
 git_module_setup
 apply_patches
 build
 install
 gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
    -arch ${ARCH} \
    -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \
    ${BUILD_DIR}/lib/libcommon.a \
    ${BUILD_DIR}/lib/libllama.a \
    ${BUILD_DIR}/lib/libggml_static.a \
    -lpthread -ldl -lm -lc++ \
    -framework Accelerate \
    -framework Foundation \
    -framework Metal \
    -framework MetalKit \
    -framework MetalPerformanceShaders
 cleanup
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -2,16 +2,14 @@
 # This script is intended to run inside the go generate
 # working directory must be llm/generate/
-# First we build our default built-in library which will be linked into the CGO
+# First we build one or more CPU based LLM libraries
 # binary as a normal dependency. This default build is CPU based.
 #
-# Then we build a CUDA dynamic library (although statically linked with the CUDA
+# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
-# library dependencies for maximum portability)
+# library dependencies
 #
-# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  ROCm is particularly
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
-# important to be a dynamic lib even if it's the only GPU library detected because
+# libraries are quite large, and also dynamically load data files at runtime
-# we can't redistribute the objectfiles but must rely on dynamic libraries at
+# which in turn are large, so we don't attempt to carry them as payload
 # runtime, which could lead the server not to start if not present.
 set -ex
 set -o pipefail
@ -59,11 +57,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        install
+        compress_libs
        link_server_lib
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@ -80,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
        #
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
        echo "Building LCD CPU"
        build
-        install
+        compress_libs
        link_server_lib
        #
        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@ -92,11 +88,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
        echo "Building AVX CPU"
        build
-        install
+        compress_libs
        link_server_lib
        #
        # ~2013 CPU Dynamic library
@ -104,11 +99,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2"
+        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
        echo "Building AVX2 CPU"
        build
-        install
+        compress_libs
        link_server_lib
    fi
 else
    echo "Skipping CPU generation step as requested"
@ -127,22 +121,27 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
-    install
+
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
-        -Wl,--whole-archive \
+    #
-        ${BUILD_DIR}/lib/libext_server.a \
+    # TODO - in the future we may shift to packaging these separately and conditionally
-        ${BUILD_DIR}/lib/libcommon.a \
+    #        downloading them in the install script.
-        ${BUILD_DIR}/lib/libllama.a \
+    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
-        -Wl,--no-whole-archive \
+    for lib in libcudart.so libcublas.so libcublasLt.so ; do
-        ${CUDA_LIB_DIR}/libcudart_static.a \
+        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
-        ${CUDA_LIB_DIR}/libcublas_static.a \
+        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-        ${CUDA_LIB_DIR}/libcublasLt_static.a \
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
-        ${CUDA_LIB_DIR}/libcudadevrt.a \
+        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-        ${CUDA_LIB_DIR}/libculibos.a \
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
-        -lcuda \
+        else
-        -lrt -lpthread -ldl -lstdc++ -lm
+            # Fallback: copy every file matching the library name, preserving symlinks (-d).
+            # The glob must stay outside the quotes; a quoted "*" is passed to cp literally,
+            # matches no file, and aborts the script under `set -e`.
+            cp -d "${CUDA_LIB_DIR}/${lib}"* "${BUILD_DIR}/lib/"
        fi
    done
    compress_libs
 fi
 if [ -z "${ROCM_PATH}" ]; then
@ -164,19 +163,13 @@ if [ -d "${ROCM_PATH}" ]; then
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
-    install
+
-    gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
+    # Note: the ROCM libs and runtime library files are too large to embed, so we depend on
-        -Wl,--whole-archive \
+    #       them being present at runtime on the host
-        ${BUILD_DIR}/lib/libext_server.a \
+    compress_libs
        ${BUILD_DIR}/lib/libcommon.a \
        ${BUILD_DIR}/lib/libllama.a \
        -Wl,--no-whole-archive \
        -lrt -lpthread -ldl -lstdc++ -lm \
        -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
        -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
 fi
 cleanup
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -5,7 +5,8 @@ $ErrorActionPreference = "Stop"
 function init_vars {
    $script:llamacppDir = "../llama.cpp"
    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off",  "-A","x64")
-    $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
+    $script:cmakeTargets = @("ext_server")
    $script:ARCH = "amd64" # arm not yet supported.
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
        $script:config = "RelWithDebInfo"
@ -13,6 +14,17 @@ function init_vars {
        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off")
        $script:config = "Release"
    }
    # Try to find the CUDA dir
    if ($env:CUDA_LIB_DIR -eq $null) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
        if ($d -ne $null) {
            $script:CUDA_LIB_DIR=($d| split-path -parent)
        }
    } else {
        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
    }
    $script:BZIP2=(get-command -ea 'silentlycontinue' bzip2).path
    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
 }
 function git_module_setup {
@ -47,11 +59,25 @@ function build {
 function install {
    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
    md "${script:buildDir}/lib" -ea 0 > $null
-    cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib"
+    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
    # Display the dll dependencies in the build log
-    dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll"
+    if ($script:DUMPBIN -ne $null) {
        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
    }
 }
 # Compress every DLL in the current build's lib directory with bzip2.
 # Only logs a message and returns when init_vars did not locate a bzip2 executable.
 function compress_libs {
    if ($script:BZIP2 -eq $null) {
        write-host "bzip2 not installed, not compressing files"
        return
    }
    write-host "Compressing dlls..."
    $libs = dir "${script:buildDir}/lib/*.dll"
    foreach ($file in $libs) {
        # -v9: verbose output, highest compression level
        & "$script:BZIP2" -v9 $file
    }
 }
 function cleanup {
@ -71,33 +97,47 @@ apply_patches
 $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
 write-host "Building LCD CPU"
 build
 install
 compress_libs
 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
 write-host "Building AVX CPU"
 build
 install
 compress_libs
 $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
 write-host "Building AVX2 CPU"
 build
 install
 compress_libs
-# Then build cuda as a dynamically loaded library
+if ($null -ne $script:CUDA_LIB_DIR) {
-# TODO figure out how to detect cuda version
+    # Then build cuda as a dynamically loaded library
-init_vars
+    $nvcc = (get-command -ea 'silentlycontinue' nvcc)
-$script:buildDir="${script:llamacppDir}/build/windows/cuda"
+    if ($null -ne $nvcc) {
-$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
-build
+    }
-install
+    if ($null -ne $script:CUDA_VERSION) {
-
+        $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
    }
    init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
    build
    install
    cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
    cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib"
    cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib"
    compress_libs
 }
 # TODO - actually implement ROCm support on windows
-$script:buildDir="${script:llamacppDir}/build/windows/rocm"
+$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm"
 rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
 md "${script:buildDir}/lib" -ea 0 > $null
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@ -1,9 +1,9 @@
 package llm
 import (
 	"compress/bzip2"
 	"errors"
 	"fmt"
 	"golang.org/x/exp/slices"
 	"io"
 	"io/fs"
 	"log"
@ -12,6 +12,9 @@ import (
 	"runtime"
 	"strings"
 	"golang.org/x/exp/slices"
 	"golang.org/x/sync/errgroup"
 	"github.com/jmorganca/ollama/gpu"
 )
@ -20,7 +23,7 @@ import (
 // Any library without a variant is the lowest common denominator
 var availableDynLibs = map[string]string{}
-const pathComponentCount = 6
+const pathComponentCount = 7
 // getDynLibs returns an ordered list of LLM libraries to try, starting with the best
 func getDynLibs(gpuInfo gpu.GpuInfo) []string {
@ -100,6 +103,7 @@ func rocmDynLibPresent() bool {
 }
 func nativeInit(workdir string) error {
 	log.Printf("Extracting dynamic libraries...")
 	if runtime.GOOS == "darwin" {
 		err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
 		if err != nil {
@ -113,7 +117,7 @@ func nativeInit(workdir string) error {
 		os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
 	}
-	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			log.Printf("%s", payloadMissing)
@ -151,25 +155,39 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 	}
 	libs := []string{}
 	// TODO consider making this idempotent with some sort of persistent directory (where we store models probably)
 	// and tracking by version so we don't reexpand the files every time
 	// Also maybe consider lazy loading only what is needed
 	g := new(errgroup.Group)
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
 		if len(pathComps) != pathComponentCount {
 			log.Printf("unexpected payload components: %v", pathComps)
 			continue
 		}
-		// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
+
 		file := file
 		g.Go(func() error {
 			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
 			// Include the variant in the path to avoid conflicts between multiple server libs
 			targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
 			srcFile, err := libEmbed.Open(file)
 			if err != nil {
-			return nil, fmt.Errorf("read payload %s: %v", file, err)
+				return fmt.Errorf("read payload %s: %v", file, err)
 			}
 			defer srcFile.Close()
 			if err := os.MkdirAll(targetDir, 0o755); err != nil {
-			return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+				return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 			}
 			src := io.Reader(srcFile)
 			filename := file
 			if strings.HasSuffix(file, ".bz2") {
 				src = bzip2.NewReader(src)
 				filename = strings.TrimSuffix(filename, ".bz2")
 			}
-		destFile := filepath.Join(targetDir, filepath.Base(file))
+			destFile := filepath.Join(targetDir, filepath.Base(filename))
 			if strings.Contains(destFile, "server") {
 				libs = append(libs, destFile)
 			}
@ -179,17 +197,19 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 			case errors.Is(err, os.ErrNotExist):
 				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
 				if err != nil {
-				return nil, fmt.Errorf("write payload %s: %v", file, err)
+					return fmt.Errorf("write payload %s: %v", file, err)
 				}
 				defer destFile.Close()
-			if _, err := io.Copy(destFile, srcFile); err != nil {
+				if _, err := io.Copy(destFile, src); err != nil {
-				return nil, fmt.Errorf("copy payload %s: %v", file, err)
+					return fmt.Errorf("copy payload %s: %v", file, err)
 				}
 			case err != nil:
-			return nil, fmt.Errorf("stat payload %s: %v", file, err)
+				return fmt.Errorf("stat payload %s: %v", file, err)
 			}
 			return nil
 		})
 	}
-	return libs, nil
+	return libs, g.Wait()
 }
 func extractPayloadFiles(workDir, glob string) error {
@ -207,8 +227,14 @@ func extractPayloadFiles(workDir, glob string) error {
 		if err := os.MkdirAll(workDir, 0o755); err != nil {
 			return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
 		}
 		src := io.Reader(srcFile)
 		filename := file
 		if strings.HasSuffix(file, ".bz2") {
 			src = bzip2.NewReader(src)
 			filename = strings.TrimSuffix(filename, ".bz2")
 		}
-		destFile := filepath.Join(workDir, filepath.Base(file))
+		destFile := filepath.Join(workDir, filepath.Base(filename))
 		_, err = os.Stat(destFile)
 		switch {
 		case errors.Is(err, os.ErrNotExist):
@ -217,7 +243,7 @@ func extractPayloadFiles(workDir, glob string) error {
 				return fmt.Errorf("write payload %s: %v", file, err)
 			}
 			defer destFile.Close()
-			if _, err := io.Copy(destFile, srcFile); err != nil {
+			if _, err := io.Copy(destFile, src); err != nil {
 				return fmt.Errorf("copy payload %s: %v", file, err)
 			}
 		case err != nil:
--- a/llm/payload_darwin.go
+++ b/llm/payload_darwin.go
@ -1,8 +0,0 @@
 package llm
 import (
 	"embed"
 )
 //go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so
 var libEmbed embed.FS
--- a/llm/payload_darwin_amd64.go
+++ b/llm/payload_darwin_amd64.go
@ -0,0 +1,8 @@
 package llm
 import (
 	"embed"
 )
 //go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
 var libEmbed embed.FS
--- a/llm/payload_darwin_arm64.go
+++ b/llm/payload_darwin_arm64.go
@ -0,0 +1,8 @@
 package llm
 import (
 	"embed"
 )
 //go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
 var libEmbed embed.FS
--- a/llm/payload_linux.go
+++ b/llm/payload_linux.go
@ -4,5 +4,5 @@ import (
 	"embed"
 )
-//go:embed llama.cpp/build/linux/*/lib/*.so
+//go:embed llama.cpp/build/linux/*/*/lib/*.so*
 var libEmbed embed.FS
--- a/llm/payload_windows.go
+++ b/llm/payload_windows.go
@ -4,5 +4,5 @@ import (
 	"embed"
 )
-//go:embed llama.cpp/build/windows/*/lib/*.dll
+//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
 var libEmbed embed.FS
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@ -1,6 +1,6 @@
 #!/bin/sh
-set -eu
+set -e
 export VERSION=${VERSION:-0.0.0}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'"
@ -11,21 +11,36 @@ for TARGETARCH in arm64 amd64; do
    rm -rf llm/llama.cpp/build
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
-lipo -create -output dist/ollama dist/ollama-darwin-*
+lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
-rm -f dist/ollama-darwin-*
+rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
-codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
+if [ -n "$APPLE_IDENTITY" ]; then
    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
 else
    echo "Skipping code signing - set APPLE_IDENTITY"
 fi
 chmod +x dist/ollama
-# build and sign the mac app
+# build and optionally sign the mac app
 npm install --prefix app
-npm run --prefix app make:sign
+if [ -n "$APPLE_IDENTITY" ]; then
    npm run --prefix app make:sign
 else 
    npm run --prefix app make
 fi
 cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
 # sign the binary and rename it
-codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
+if [ -n "$APPLE_IDENTITY" ]; then
    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
 else
    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
 fi
 ditto -c -k --keepParent dist/ollama dist/temp.zip
-xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+if [ -n "$APPLE_IDENTITY" ]; then
    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
 fi
 mv dist/ollama dist/ollama-darwin
 rm -f dist/temp.zip
--- a/scripts/build_remote.py
+++ b/scripts/build_remote.py
@ -66,3 +66,7 @@ subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...
 print("Building")
 subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
 print("Copying built result")
 subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe",  './dist/'])
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@ -28,6 +28,7 @@ fi
 if [ -n "${CMAKE_VERSION}" ]; then
    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
    dnf install -y bzip2
 fi
 if [ -n "${GOLANG_VERSION}" ]; then