From 8da7bef05f6d14dff244860db75d84cc0995e0dd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 5 Jan 2024 12:13:08 -0800 Subject: [PATCH 1/4] Support multiple variants for a given llm lib type In some cases we may want multiple variants for a given GPU type or CPU. This adds logic to have an optional Variant which we can use to select an optimal library, but also allows us to try multiple variants in case some fail to load. This can be useful for scenarios such as ROCm v5 vs v6 incompatibility or potentially CPU features. --- Dockerfile.build | 52 +++++--- gpu/gpu.go | 9 ++ gpu/gpu_info_rocm.c | 28 +++- gpu/gpu_info_rocm.h | 14 ++ gpu/types.go | 3 + llm/dynamic_shim.c | 2 +- llm/ext_server/README.md | 18 ++- llm/ext_server_windows.go | 5 +- llm/generate/gen_linux.sh | 33 +++-- llm/llm.go | 18 +-- llm/shim.go | 228 +++++++++++++++++++++++++++++++++ llm/shim_darwin.go | 55 -------- llm/shim_ext_server.go | 86 ------------- llm/shim_ext_server_linux.go | 23 ---- llm/shim_ext_server_windows.go | 5 - llm/shim_test.go | 61 +++++++++ 16 files changed, 428 insertions(+), 212 deletions(-) create mode 100644 llm/shim.go create mode 100644 llm/shim_test.go diff --git a/Dockerfile.build b/Dockerfile.build index ca6b1a29..9ba44398 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,7 +1,6 @@ ARG GOLANG_VERSION=1.21.3 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION=11.3.1 -ARG ROCM_VERSION=5.7.1 FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 @@ -16,9 +15,11 @@ ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CM RUN tar -zx -C /usr --strip-components 1 +#define ROCM_LOOKUP_SIZE 5 + void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { rsmi_status_t ret; resp->err = NULL; @@ -13,11 +15,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { struct lookup { char *s; void **p; - } l[4] = { + } l[ROCM_LOOKUP_SIZE] = { {"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, + {"rsmi_version_get", (void *)&resp->rh.versionGetFn}, // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, }; @@ -32,7 +35,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { return; } - for (i = 0; i < 4; i++) { + for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); if (!l[i].p) { UNLOAD_LIBRARY(resp->rh.handle); @@ -103,4 +106,25 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { return; } +void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { + const int buflen = 256; + char buf[buflen + 1]; + if (h.handle == NULL) { + resp->str = strdup("nvml handle not initialized"); + resp->status = 1; + return; + } + rsmi_version_t ver; + rsmi_status_t ret; + ret = h.versionGetFn(&ver); + if (ret != RSMI_STATUS_SUCCESS) { + snprintf(buf, buflen, "unexpected response on version lookup %d", ret); + resp->status = 1; + } else { + snprintf(buf, buflen, "%d", ver.major); + resp->status = 0; + } + resp->str = strdup(buf); +} + #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h index 1f74713b..90d9a09f 100644 --- a/gpu/gpu_info_rocm.h +++ b/gpu/gpu_info_rocm.h @@ -15,12 +15,20 @@ typedef enum rsmi_memory_type { RSMI_MEM_TYPE_GTT, } rsmi_memory_type_t; + typedef struct { + uint32_t major; + uint32_t minor; + uint32_t patch; + const char *build; + } rsmi_version_t; + typedef 
struct rocm_handle { void *handle; rsmi_status_t (*initFn)(uint64_t); rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*versionGetFn) (rsmi_version_t *version); // rsmi_status_t (*getHandle)(uint32_t, uint16_t *); } rocm_handle_t; @@ -29,8 +37,14 @@ typedef struct rocm_init_resp { rocm_handle_t rh; } rocm_init_resp_t; +typedef struct rocm_version_resp { + rsmi_status_t status; + char *str; // Contains version or error string if status != 0 +} rocm_version_resp_t; + void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp); void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); +void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp); #endif // __GPU_INFO_ROCM_H__ #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/types.go b/gpu/types.go index abc16dbc..24fa4a24 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -11,5 +11,8 @@ type GpuInfo struct { memInfo Library string `json:"library,omitempty"` + // Optional variant to select (e.g. versions, cpu feature flags) + Variant string `json:"variant,omitempty"` + // TODO add other useful attributes about the card here for discovery information } diff --git a/llm/dynamic_shim.c b/llm/dynamic_shim.c index c3e74d4a..ca7c372a 100644 --- a/llm/dynamic_shim.c +++ b/llm/dynamic_shim.c @@ -58,7 +58,7 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, {"", NULL}, }; - printf("Lazy loading %s library\n", libPath); + printf("loading %s library\n", libPath); s->handle = LOAD_LIBRARY(libPath, RTLD_NOW); if (!s->handle) { err->id = -1; diff --git a/llm/ext_server/README.md b/llm/ext_server/README.md index ac58d9c8..bfb0d4a6 100644 --- a/llm/ext_server/README.md +++ b/llm/ext_server/README.md @@ -1,4 +1,18 @@ # Extern C Server -This directory contains a thin facade we layer on top of the Llama.cpp server -to expose `extern C` interfaces to access the functionality through direct API calls in-process +This directory contains a thin facade we layer on top of the Llama.cpp server to +expose `extern C` interfaces to access the functionality through direct API +calls in-process. The llama.cpp code uses compile time macros to configure GPU +type along with other settings. During the `go generate ./...` execution, the +build will generate one or more copies of the llama.cpp `extern C` server based +on what GPU libraries are detected to support multiple GPU types as well as CPU +only support. The Ollama go build then embeds these different servers to support +different GPUs and settings at runtime. + +If you are making changes to the code in this directory, make sure to disable +caching during your go build to ensure you pick up your changes. A typical +iteration cycle from the top of the source tree looks like: + +``` +go generate ./... && go build -a . 
+``` \ No newline at end of file diff --git a/llm/ext_server_windows.go b/llm/ext_server_windows.go index 39b5f096..9d361cf8 100644 --- a/llm/ext_server_windows.go +++ b/llm/ext_server_windows.go @@ -1,6 +1,8 @@ package llm import ( + "fmt" + "github.com/jmorganca/ollama/api" ) @@ -8,5 +10,6 @@ func newDefaultExtServer(model string, adapters, projectors []string, opts api.O // On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies // This ensures we can update the PATH at runtime to get everything loaded - return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts) + // This should never happen as we'll always try to load one or more cpu dynamic libaries before hitting default + return nil, fmt.Errorf("no available default llm library on windows") } diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 52081156..99f5b0ac 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -48,23 +48,31 @@ init_vars git_module_setup apply_patches -# -# CPU first for the default library -# -CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" -BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" +if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then + # + # CPU first for the default library + # + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" -build -install + build + install -# Placeholder to keep go embed happy until we start building dynamic CPU lib variants -touch ${BUILD_DIR}/lib/dummy.so + # Placeholder to keep go embed happy until we start building dynamic CPU lib variants + touch ${BUILD_DIR}/lib/dummy.so +else + echo "Skipping CPU generation step as requested" +fi if [ -d /usr/local/cuda/lib64/ ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars + CUDA_MAJOR=$(ls /usr/local/cuda/lib64/libcudart.so.* | head -1 | cut -f3 -d. || true) + if [ -n "${CUDA_MAJOR}" ]; then + CUDA_VARIANT=_v${CUDA_MAJOR} + fi CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}" CUDA_LIB_DIR=/usr/local/cuda/lib64 build install @@ -96,9 +104,12 @@ fi if [ -d "${ROCM_PATH}" ]; then echo "ROCm libraries detected - building dynamic ROCm library" + if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then + ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. 
|| true) + fi init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}" build install gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ diff --git a/llm/llm.go b/llm/llm.go index 940c0d93..4031cc28 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -19,8 +19,6 @@ type LLM interface { Close() } -var AvailableShims = map[string]string{} - func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err @@ -131,7 +129,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.NumGQA = 0 opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlmServer(library, model, adapters, projectors, opts) + gpuInfo := gpu.GetGPUInfo() + return newLlmServer(gpuInfo, model, adapters, projectors, opts) } // Give any native cgo implementations an opportunity to initialize @@ -139,15 +138,18 @@ func Init(workdir string) error { return nativeInit(workdir) } -func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - if _, libPresent := AvailableShims[library]; libPresent && library != "default" { - srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts) +func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) { + for _, shim := range getShims(gpuInfo) { + if shim == "default" { + break + } + srv, err := newDynamicShimExtServer(shim, model, adapters, projectors, opts) if err == nil { return srv, nil } - log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err) - // TODO - update some state to indicate we were unable to load the GPU library for future "info" ux + log.Printf("Failed to load dynamic library %s %s", shim, err) } return newDefaultExtServer(model, adapters, projectors, opts) + } diff --git a/llm/shim.go b/llm/shim.go new file mode 100644 index 00000000..bbf995f9 --- /dev/null +++ b/llm/shim.go @@ -0,0 +1,228 @@ +package llm + +import ( + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "runtime" + "slices" + "strings" + + "github.com/jmorganca/ollama/gpu" +) + +// Shims names may contain an optional variant separated by '_' +// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2" +var availableShims = map[string]string{} + +const pathComponentCount = 6 + +// getShims returns an ordered list of shims to try, starting with the best +func getShims(gpuInfo gpu.GpuInfo) []string { + exactMatch := "" + shims := []string{} + altShims := []string{} + requested := gpuInfo.Library + if gpuInfo.Variant != "" { + requested += "_" + gpuInfo.Variant + } + // First try to find an exact match + for cmp := range availableShims { + if requested == cmp { + exactMatch = cmp + shims = append(shims, availableShims[cmp]) + break + } + } + // Then load alternates and sort the list for consistent load ordering + for cmp := range availableShims { + if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch { + altShims = append(altShims, cmp) + } + } + slices.Sort(altShims) + for _, altShim := range altShims { + shims = append(shims, availableShims[altShim]) + } + + // Load up the CPU 
alternates if not primary requested + if gpuInfo.Library != "cpu" { + altShims = []string{} + for cmp := range availableShims { + if strings.Split(cmp, "_")[0] == "cpu" { + altShims = append(altShims, cmp) + } + } + slices.Sort(altShims) + for _, altShim := range altShims { + shims = append(shims, availableShims[altShim]) + } + } + // default is always last as the lowest common denominator + shims = append(shims, "default") + return shims +} + +func rocmShimPresent() bool { + for shimName := range availableShims { + if strings.HasPrefix(shimName, "rocm") { + return true + } + } + return false +} + +func nativeInit(workdir string) error { + if runtime.GOOS == "darwin" { + err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") + if err != nil { + if err == payloadMissing { + // TODO perhaps consider this a hard failure on arm macs? + log.Printf("ggml-meta.metal payload missing") + return nil + } + return err + } + os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) + return nil + } + + libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") + if err != nil { + if err == payloadMissing { + log.Printf("%s", payloadMissing) + return nil + } + return err + } + for _, lib := range libs { + // The last dir component is the variant name + variant := filepath.Base(filepath.Dir(lib)) + availableShims[variant] = lib + } + + if err := verifyDriverAccess(); err != nil { + return err + } + + // Report which dynamic libraries we have loaded to assist troubleshooting + variants := make([]string, len(availableShims)) + i := 0 + for variant := range availableShims { + variants[i] = variant + i++ + } + log.Printf("Dynamic LLM variants %v", variants) + + return nil +} + +func extractDynamicLibs(workDir, glob string) ([]string, error) { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return nil, payloadMissing + } + libs := []string{} + + for _, file := range files { + pathComps := strings.Split(file, "/") + if len(pathComps) != pathComponentCount { + log.Printf("unexpected payload components: %v", pathComps) + continue + } + // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY + // Include the variant in the path to avoid conflicts between multiple server libs + targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) + srcFile, err := libEmbed.Open(file) + if err != nil { + return nil, fmt.Errorf("read payload %s: %v", file, err) + } + defer srcFile.Close() + if err := os.MkdirAll(targetDir, 0o755); err != nil { + return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err) + } + + destFile := filepath.Join(targetDir, filepath.Base(file)) + if strings.Contains(destFile, "server") { + libs = append(libs, destFile) + } + + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return nil, fmt.Errorf("write payload %s: %v", file, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, srcFile); err != nil { + return nil, fmt.Errorf("copy payload %s: %v", file, err) + } + case err != nil: + return nil, fmt.Errorf("stat payload %s: %v", file, err) + } + } + return libs, nil +} + +func extractPayloadFiles(workDir, glob string) error { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return payloadMissing + } + + for _, file := range files { + srcFile, err := libEmbed.Open(file) + if err != nil { + return fmt.Errorf("read payload %s: %v", file, err) + } + defer srcFile.Close() + if err 
:= os.MkdirAll(workDir, 0o755); err != nil {
+			return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+		}
+
+		destFile := filepath.Join(workDir, filepath.Base(file))
+		_, err = os.Stat(destFile)
+		switch {
+		case errors.Is(err, os.ErrNotExist):
+			destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			if err != nil {
+				return fmt.Errorf("write payload %s: %v", file, err)
+			}
+			defer destFile.Close()
+			if _, err := io.Copy(destFile, srcFile); err != nil {
+				return fmt.Errorf("copy payload %s: %v", file, err)
+			}
+		case err != nil:
+			return fmt.Errorf("stat payload %s: %v", file, err)
+		}
+	}
+	return nil
+}
+
+func verifyDriverAccess() error {
+	if runtime.GOOS != "linux" {
+		return nil
+	}
+	// Only check ROCm access if we have the dynamic lib loaded
+	if rocmShimPresent() {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add your user account to the render group.")
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+		}
+		fd.Close()
+	}
+	return nil
+}
diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go
index 3baafd1e..9ef8ef96 100644
--- a/llm/shim_darwin.go
+++ b/llm/shim_darwin.go
@@ -2,13 +2,7 @@ package llm

 import (
 	"embed"
-	"errors"
 	"fmt"
-	"io"
-	"io/fs"
-	"log"
-	"os"
-	"path/filepath"

 	"github.com/jmorganca/ollama/api"
 )
@@ -20,52 +14,3 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
 	// should never happen...
 	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }
-
-func nativeInit(workdir string) error {
-	err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
-	if err != nil {
-		if err == payloadMissing {
-			// TODO perhaps consider this a hard failure on arm macs?
- log.Printf("ggml-meta.metal payload missing") - return nil - } - return err - } - os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) - return nil -} - -func extractPayloadFiles(workDir, glob string) error { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return payloadMissing - } - - for _, file := range files { - srcFile, err := libEmbed.Open(file) - if err != nil { - return fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(workDir, 0o755); err != nil { - return fmt.Errorf("create payload temp dir %s: %v", workDir, err) - } - - destFile := filepath.Join(workDir, filepath.Base(file)) - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", file, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return fmt.Errorf("copy payload %s: %v", file, err) - } - case err != nil: - return fmt.Errorf("stat payload %s: %v", file, err) - } - } - return nil -} diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index dca7b38d..102f059c 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -11,14 +11,9 @@ package llm import "C" import ( "context" - "errors" "fmt" - "io" - "io/fs" "log" - "os" "path/filepath" - "strings" "sync" "unsafe" @@ -34,8 +29,6 @@ type shimExtServer struct { var shimMutex sync.Mutex var llm *shimExtServer -const pathComponentCount = 6 - func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { C.dynamic_shim_llama_server_init(llm.s, sparams, err) } @@ -112,82 +105,3 @@ func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float6 func (llm *shimExtServer) Close() { close(llm) } - -func nativeInit(workdir string) error { - libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") - if err != nil { - if err == payloadMissing { - log.Printf("%s", payloadMissing) - return nil - } - return err - } - for _, lib := range libs { - // The last dir component is the variant name - variant := filepath.Base(filepath.Dir(lib)) - AvailableShims[variant] = lib - } - - if err := verifyDriverAccess(); err != nil { - return err - } - - // Report which dynamic libraries we have loaded to assist troubleshooting - variants := make([]string, len(AvailableShims)) - i := 0 - for variant := range AvailableShims { - variants[i] = variant - i++ - } - log.Printf("Dynamic LLM variants %v", variants) - - return nil -} - -func extractDynamicLibs(workDir, glob string) ([]string, error) { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return nil, payloadMissing - } - libs := []string{} - - for _, file := range files { - pathComps := strings.Split(file, "/") - if len(pathComps) != pathComponentCount { - log.Printf("unexpected payload components: %v", pathComps) - continue - } - // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY - // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) - srcFile, err := libEmbed.Open(file) - if err != nil { - return nil, fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err) - } - - destFile := filepath.Join(targetDir, filepath.Base(file)) 
- if strings.Contains(destFile, "server") { - libs = append(libs, destFile) - } - - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return nil, fmt.Errorf("write payload %s: %v", file, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return nil, fmt.Errorf("copy payload %s: %v", file, err) - } - case err != nil: - return nil, fmt.Errorf("stat payload %s: %v", file, err) - } - } - return libs, nil -} diff --git a/llm/shim_ext_server_linux.go b/llm/shim_ext_server_linux.go index e0ad5da4..a9a8aca2 100644 --- a/llm/shim_ext_server_linux.go +++ b/llm/shim_ext_server_linux.go @@ -2,9 +2,6 @@ package llm import ( "embed" - "errors" - "fmt" - "io/fs" "log" "os" "strings" @@ -24,23 +21,3 @@ func updatePath(dir string) { log.Printf("Updating PATH to %s", newPath) os.Setenv("PATH", newPath) } - -func verifyDriverAccess() error { - // Only check ROCm access if we have the dynamic lib loaded - if _, rocmPresent := AvailableShims["rocm"]; rocmPresent { - // Verify we have permissions - either running as root, or we have group access to the driver - fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) - if err != nil { - if errors.Is(err, fs.ErrPermission) { - return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.") - } else if errors.Is(err, fs.ErrNotExist) { - // expected behavior without a radeon card - return nil - } - - return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) - } - fd.Close() - } - return nil -} diff --git a/llm/shim_ext_server_windows.go b/llm/shim_ext_server_windows.go index e95c8afa..c218c6f3 100644 --- a/llm/shim_ext_server_windows.go +++ b/llm/shim_ext_server_windows.go @@ -29,8 +29,3 @@ func updatePath(dir string) { log.Printf("Updating PATH to %s", newPath) os.Setenv("PATH", newPath) } - -func verifyDriverAccess() error { - // TODO if applicable - return nil -} diff --git a/llm/shim_test.go b/llm/shim_test.go new file mode 100644 index 00000000..7a1c5acc --- /dev/null +++ b/llm/shim_test.go @@ -0,0 +1,61 @@ +package llm + +import ( + "testing" + + "github.com/jmorganca/ollama/gpu" + "github.com/stretchr/testify/assert" +) + +func TestGetShims(t *testing.T) { + availableShims = map[string]string{ + "cpu": "X_cpu", + } + assert.Equal(t, false, rocmShimPresent()) + res := getShims(gpu.GpuInfo{Library: "cpu"}) + assert.Len(t, res, 2) + assert.Equal(t, availableShims["cpu"], res[0]) + assert.Equal(t, "default", res[1]) + + availableShims = map[string]string{ + "rocm_v5": "X_rocm_v5", + "rocm_v6": "X_rocm_v6", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmShimPresent()) + res = getShims(gpu.GpuInfo{Library: "rocm"}) + assert.Len(t, res, 4) + assert.Equal(t, availableShims["rocm_v5"], res[0]) + assert.Equal(t, availableShims["rocm_v6"], res[1]) + assert.Equal(t, availableShims["cpu"], res[2]) + assert.Equal(t, "default", res[3]) + + res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) + assert.Len(t, res, 4) + assert.Equal(t, availableShims["rocm_v6"], res[0]) + assert.Equal(t, availableShims["rocm_v5"], res[1]) + assert.Equal(t, availableShims["cpu"], res[2]) + assert.Equal(t, "default", res[3]) + + res = getShims(gpu.GpuInfo{Library: "cuda"}) + assert.Len(t, res, 2) + assert.Equal(t, availableShims["cpu"], res[0]) + assert.Equal(t, "default", res[1]) + + res = getShims(gpu.GpuInfo{Library: 
"default"}) + assert.Len(t, res, 2) + assert.Equal(t, availableShims["cpu"], res[0]) + assert.Equal(t, "default", res[1]) + + availableShims = map[string]string{ + "rocm": "X_rocm_v5", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmShimPresent()) + res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) + assert.Len(t, res, 3) + assert.Equal(t, availableShims["rocm"], res[0]) + assert.Equal(t, availableShims["cpu"], res[1]) + assert.Equal(t, "default", res[2]) + +} From 052b33b81bef9482dc8dd8306c197c0d5aa1e4a6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 6 Jan 2024 16:46:55 -0800 Subject: [PATCH 2/4] DRY out the Dockefile.build --- Dockerfile.build | 112 +++++++++++++++------------------------ scripts/rh_linux_deps.sh | 43 +++++++++++++++ 2 files changed, 86 insertions(+), 69 deletions(-) create mode 100644 scripts/rh_linux_deps.sh diff --git a/Dockerfile.build b/Dockerfile.build index 9ba44398..22ad29d3 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -2,90 +2,75 @@ ARG GOLANG_VERSION=1.21.3 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION=11.3.1 -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 - -ARG CMAKE_VERSION - -RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \ - && yum update -y \ - && yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH - -ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz -RUN tar -zx -C /usr --strip-components 1 /dev/null; then + # Centos 7 derivatives have too old of a git version to run our generate script + # uninstall and ignore failures + yum remove -y git + yum -y install epel-release centos-release-scl + yum -y install dnf + if [ "${MACHINE}" = "x86_64" ]; then + yum -y install https://repo.ius.io/ius-release-el7.rpm + dnf install -y git236 + else + dnf install -y rh-git227-git + ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git + fi + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ +elif grep -i "rocky" /etc/system-release >/dev/null; then + dnf install -y git gcc-toolset-10-gcc gcc-toolset-10-gcc-c++ +else + echo "ERROR Unexpected distro" + exit 1 +fi + +if [ -n "${CMAKE_VERSION}" ]; then + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +fi + +if [ -n "${GOLANG_VERSION}" ]; then + if [ "${MACHINE}" = "x86_64" ]; then + GO_ARCH="amd64" + else + GO_ARCH="arm64" + fi + mkdir -p /usr/local + curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local + ln -s /usr/local/go/bin/go /usr/local/bin/go + ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt +fi From d88c527be392ff4a05648f6e2cbd8f69241714ca Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sun, 7 Jan 2024 15:48:05 -0800 Subject: [PATCH 3/4] Build multiple CPU variants and pick the best This reduces the built-in linux version to not use any vector extensions which enables the resulting builds to run under Rosetta on MacOS in Docker. 
Then at runtime it checks for the actual CPU vector extensions and loads the best CPU library available --- Dockerfile.build | 2 + docs/development.md | 16 +++++ docs/troubleshooting.md | 35 +++++++++- gpu/gpu.go | 8 +-- gpu/gpu_darwin.go | 5 ++ llm/{ext_server_windows.go => ext_server.go} | 6 +- llm/ext_server_common.go | 8 +-- llm/ext_server_default.go | 4 +- llm/generate/gen_common.sh | 10 +++ llm/generate/gen_linux.sh | 69 +++++++++++++++++--- llm/llm.go | 17 ++++- llm/shim.go | 57 ++++++++++------ llm/shim_ext_server_linux.go | 8 +-- llm/shim_test.go | 21 ++---- scripts/build_linux.sh | 2 +- 15 files changed, 202 insertions(+), 66 deletions(-) rename llm/{ext_server_windows.go => ext_server.go} (66%) diff --git a/Dockerfile.build b/Dockerfile.build index 22ad29d3..96b06138 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -49,6 +49,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64 ARG CMAKE_VERSION ARG GOLANG_VERSION +ARG OLLAMA_CUSTOM_CPU_DEFS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -59,6 +60,7 @@ RUN sh gen_linux.sh FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64 ARG CMAKE_VERSION ARG GOLANG_VERSION +ARG OLLAMA_CUSTOM_CPU_DEFS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH diff --git a/docs/development.md b/docs/development.md index d36a7ed7..7e7fbc3b 100644 --- a/docs/development.md +++ b/docs/development.md @@ -76,6 +76,22 @@ go build . ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root. +#### Advanced CPU Settings + +By default, running `go generate ./...` will compile a few different variations +of the LLM library based on common CPU families and vector math capabilities, +including a lowest-common-denominator which should run on almost any 64 bit CPU +somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to +load. If you would like to build a CPU-based build customized for your +processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would +like to use. For example, to compile an optimized binary for an Intel i9-9880H, +you might use: + +``` +OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_F16C=on -DLLAMA_FMA=on" go generate ./... +go build . +``` + #### Containerized Linux Build If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index fe842bbd..c1a8eec9 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -16,7 +16,38 @@ If manually running `ollama serve` in a terminal, the logs will be on that termi Join the [Discord](https://discord.gg/ollama) for help interpreting the logs. +## LLM libraries + +Ollama includes multiple LLM libraries compiled for different GPUs and CPU +vector features. Ollama tries to pick the best one based on the capabilities of +your system. If this autodetection has problems, or you run into other problems +(e.g. crashes in your GPU) you can workaround this by forcing a specific LLM +library. 
`cpu_avx2` will perform the best, followed by `cpu_avx`, and the slowest
+but most compatible is `cpu`. Rosetta emulation under MacOS will work with the
+`cpu` library.
+
+In the server log, you will see a message that looks something like this (varies
+from release to release):
+
+```
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
+```
+
+**Experimental LLM Library Override**
+
+You can set OLLAMA_LLM_LIBRARY to any of the available LLM libraries to bypass
+autodetection, so for example, if you have a CUDA card, but want to force the
+CPU LLM library with AVX2 vector support, use:
+
+```
+OLLAMA_LLM_LIBRARY="cpu_avx2" ollama serve
+```
+
+You can see what features your CPU has with the following command:
+```
+cat /proc/cpuinfo | grep flags | head -1
+```
+
 ## Known issues
-
-* `signal: illegal instruction (core dumped)`: Ollama requires AVX support from the CPU. This was introduced in 2011 and CPUs started offering it in 2012. CPUs from before that and some lower end CPUs after that may not have AVX support and thus are not supported by Ollama. Some users have had luck with building Ollama on their machines disabling the need for AVX.
+* N/A
\ No newline at end of file
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 06f6526d..9b3d51b6 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -158,12 +158,8 @@ func GetGPUInfo() GpuInfo {
 	}
 	if resp.Library == "" {
 		C.cpu_check_ram(&memInfo)
-		// In the future we may offer multiple CPU variants to tune CPU features
-		if runtime.GOOS == "windows" {
-			resp.Library = "cpu"
-		} else {
-			resp.Library = "default"
-		}
+		resp.Library = "cpu"
+		resp.Variant = GetCPUVariant()
 	}
 	if memInfo.err != nil {
 		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 23c95e36..79645285 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -49,3 +49,8 @@ func getCPUMem() (memInfo, error) {
 func nativeInit() error {
 	return nil
 }
+
+func GetCPUVariant() string {
+	// We don't yet have CPU based builds for Darwin...
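+	// an empty variant string selects the lowest common denominator build (see llm/shim.go)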
+ return "" +} diff --git a/llm/ext_server_windows.go b/llm/ext_server.go similarity index 66% rename from llm/ext_server_windows.go rename to llm/ext_server.go index 9d361cf8..c8a5f0b9 100644 --- a/llm/ext_server_windows.go +++ b/llm/ext_server.go @@ -1,3 +1,5 @@ +//go:build !darwin + package llm import ( @@ -7,9 +9,9 @@ import ( ) func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) { - // On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies + // On windows and linux we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies // This ensures we can update the PATH at runtime to get everything loaded // This should never happen as we'll always try to load one or more cpu dynamic libaries before hitting default - return nil, fmt.Errorf("no available default llm library on windows") + return nil, fmt.Errorf("no available default llm library") } diff --git a/llm/ext_server_common.go b/llm/ext_server_common.go index 9a331742..b10ac60b 100644 --- a/llm/ext_server_common.go +++ b/llm/ext_server_common.go @@ -15,12 +15,6 @@ package llm #cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a #cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a #cgo linux CFLAGS: -D_GNU_SOURCE -#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS -#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm #cgo linux windows LDFLAGS: -lpthread @@ -43,6 +37,8 @@ import ( "github.com/jmorganca/ollama/api" ) +// TODO switch Linux to always be dynamic +// If that works out, then look at the impact of doing the same for Mac type extServer interface { LLM llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) diff --git a/llm/ext_server_default.go b/llm/ext_server_default.go index 05287383..31f05fb6 100644 --- a/llm/ext_server_default.go +++ b/llm/ext_server_default.go @@ -1,4 +1,4 @@ -//go:build !windows +//go:build darwin package llm @@ -14,6 +14,8 @@ import ( "github.com/jmorganca/ollama/api" ) +// TODO - explore shifting Darwin to a dynamic loading pattern for consistency with Linux and Windows + type llamaExtServer struct { api.Options } diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index ac91f1aa..d7bafa5b 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -51,6 +51,16 @@ install() { cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib } +link_server_lib() { + gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ + -Wl,--whole-archive \ + ${BUILD_DIR}/lib/libext_server.a \ + -Wl,--no-whole-archive \ + ${BUILD_DIR}/lib/libcommon.a \ + ${BUILD_DIR}/lib/libllama.a + +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 99f5b0ac..3fec7e6b 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -49,17 +49,68 @@ 
git_module_setup apply_patches if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then - # - # CPU first for the default library - # - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + # Users building from source can tune the exact flags we pass to cmake for configuring + # llama.cpp, and we'll build only 1 CPU variant in that case as the default. + if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then + echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + echo "Building custom CPU" + build + install + link_server_lib + else + # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 + # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer + # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) + # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen + # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver + # Note: the following seem to yield slower results than AVX2 - ymmv + # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT) + # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake + # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake - build - install + COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" + # + # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) + # + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + echo "Building LCD CPU" + build + install + link_server_lib - # Placeholder to keep go embed happy until we start building dynamic CPU lib variants - touch ${BUILD_DIR}/lib/dummy.so + # + # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance + # Approximately 400% faster than LCD on same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx" + echo "Building AVX CPU" + build + install + link_server_lib + + # + # ~2013 CPU Dynamic library + # Approximately 10% faster than AVX on same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2" + echo "Building AVX2 CPU" + build + install + link_server_lib + gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ + -Wl,--whole-archive \ + ${BUILD_DIR}/lib/libext_server.a \ + -Wl,--no-whole-archive \ + ${BUILD_DIR}/lib/libcommon.a \ + ${BUILD_DIR}/lib/libllama.a + fi else echo "Skipping CPU generation step as requested" fi diff --git a/llm/llm.go b/llm/llm.go index 4031cc28..05230b09 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -139,7 +139,22 @@ func Init(workdir string) error { } func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - for _, shim := range getShims(gpuInfo) { + shims := getShims(gpuInfo) + + // Check to see if the user has requested a specific library instead of auto-detecting + demandLib := os.Getenv("OLLAMA_LLM_LIBRARY") + if demandLib != "" { + libPath := availableShims[demandLib] + if libPath == "" { + log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not 
found", demandLib) + } else { + log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib) + shims = []string{libPath} + } + } + + for _, shim := range shims { + // TODO - only applies on Darwin (switch to fully dynamic there too...) if shim == "default" { break } diff --git a/llm/shim.go b/llm/shim.go index bbf995f9..e68a8ec3 100644 --- a/llm/shim.go +++ b/llm/shim.go @@ -15,14 +15,20 @@ import ( "github.com/jmorganca/ollama/gpu" ) -// Shims names may contain an optional variant separated by '_' +// Libraries names may contain an optional variant separated by '_' // For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2" +// Any library without a variant is the lowest common denominator var availableShims = map[string]string{} const pathComponentCount = 6 // getShims returns an ordered list of shims to try, starting with the best func getShims(gpuInfo gpu.GpuInfo) []string { + // Short circuit if we know we're using the default built-in (darwin only) + if gpuInfo.Library == "default" { + return []string{"default"} + } + exactMatch := "" shims := []string{} altShims := []string{} @@ -30,30 +36,18 @@ func getShims(gpuInfo gpu.GpuInfo) []string { if gpuInfo.Variant != "" { requested += "_" + gpuInfo.Variant } - // First try to find an exact match + // Try to find an exact match for cmp := range availableShims { if requested == cmp { exactMatch = cmp - shims = append(shims, availableShims[cmp]) + shims = []string{availableShims[cmp]} break } } - // Then load alternates and sort the list for consistent load ordering - for cmp := range availableShims { - if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch { - altShims = append(altShims, cmp) - } - } - slices.Sort(altShims) - for _, altShim := range altShims { - shims = append(shims, availableShims[altShim]) - } - - // Load up the CPU alternates if not primary requested + // Then for GPUs load alternates and sort the list for consistent load ordering if gpuInfo.Library != "cpu" { - altShims = []string{} for cmp := range availableShims { - if strings.Split(cmp, "_")[0] == "cpu" { + if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch { altShims = append(altShims, cmp) } } @@ -62,8 +56,30 @@ func getShims(gpuInfo gpu.GpuInfo) []string { shims = append(shims, availableShims[altShim]) } } - // default is always last as the lowest common denominator - shims = append(shims, "default") + + // Load up the best CPU variant if not primary requested + if gpuInfo.Library != "cpu" { + variant := gpu.GetCPUVariant() + // If no variant, then we fall back to default + // If we have a variant, try that if we find an exact match + // Attempting to run the wrong CPU instructions will panic the + // process + if variant != "" { + for cmp := range availableShims { + if cmp == "cpu_"+variant { + shims = append(shims, availableShims[cmp]) + break + } + } + } else { + shims = append(shims, availableShims["cpu"]) + } + } + + // Finaly, if we didn't find any matches, LCD CPU FTW + if len(shims) == 0 { + shims = []string{availableShims["cpu"]} + } return shims } @@ -116,7 +132,8 @@ func nativeInit(workdir string) error { variants[i] = variant i++ } - log.Printf("Dynamic LLM variants %v", variants) + log.Printf("Dynamic LLM libraries %v", variants) + log.Printf("Override detection logic by setting OLLAMA_LLM_LIBRARY") return nil } diff --git a/llm/shim_ext_server_linux.go b/llm/shim_ext_server_linux.go index a9a8aca2..e4bfd15e 100644 --- a/llm/shim_ext_server_linux.go +++ b/llm/shim_ext_server_linux.go @@ -11,13 +11,13 @@ import ( var 
libEmbed embed.FS func updatePath(dir string) { - pathComponents := strings.Split(os.Getenv("PATH"), ":") + pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") for _, comp := range pathComponents { if comp == dir { return } } - newPath := strings.Join(append(pathComponents, dir), ":") - log.Printf("Updating PATH to %s", newPath) - os.Setenv("PATH", newPath) + newPath := strings.Join(append([]string{dir}, pathComponents...), ":") + log.Printf("Updating LD_LIBRARY_PATH to %s", newPath) + os.Setenv("LD_LIBRARY_PATH", newPath) } diff --git a/llm/shim_test.go b/llm/shim_test.go index 7a1c5acc..8d49ce14 100644 --- a/llm/shim_test.go +++ b/llm/shim_test.go @@ -13,9 +13,8 @@ func TestGetShims(t *testing.T) { } assert.Equal(t, false, rocmShimPresent()) res := getShims(gpu.GpuInfo{Library: "cpu"}) - assert.Len(t, res, 2) + assert.Len(t, res, 1) assert.Equal(t, availableShims["cpu"], res[0]) - assert.Equal(t, "default", res[1]) availableShims = map[string]string{ "rocm_v5": "X_rocm_v5", @@ -24,28 +23,24 @@ func TestGetShims(t *testing.T) { } assert.Equal(t, true, rocmShimPresent()) res = getShims(gpu.GpuInfo{Library: "rocm"}) - assert.Len(t, res, 4) + assert.Len(t, res, 3) assert.Equal(t, availableShims["rocm_v5"], res[0]) assert.Equal(t, availableShims["rocm_v6"], res[1]) assert.Equal(t, availableShims["cpu"], res[2]) - assert.Equal(t, "default", res[3]) res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 4) + assert.Len(t, res, 3) assert.Equal(t, availableShims["rocm_v6"], res[0]) assert.Equal(t, availableShims["rocm_v5"], res[1]) assert.Equal(t, availableShims["cpu"], res[2]) - assert.Equal(t, "default", res[3]) res = getShims(gpu.GpuInfo{Library: "cuda"}) - assert.Len(t, res, 2) + assert.Len(t, res, 1) assert.Equal(t, availableShims["cpu"], res[0]) - assert.Equal(t, "default", res[1]) res = getShims(gpu.GpuInfo{Library: "default"}) - assert.Len(t, res, 2) - assert.Equal(t, availableShims["cpu"], res[0]) - assert.Equal(t, "default", res[1]) + assert.Len(t, res, 1) + assert.Equal(t, "default", res[0]) availableShims = map[string]string{ "rocm": "X_rocm_v5", @@ -53,9 +48,7 @@ func TestGetShims(t *testing.T) { } assert.Equal(t, true, rocmShimPresent()) res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 3) + assert.Len(t, res, 2) assert.Equal(t, availableShims["rocm"], res[0]) assert.Equal(t, availableShims["cpu"], res[1]) - assert.Equal(t, "default", res[2]) - } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 846103ea..582899f7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -9,7 +9,7 @@ BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} mkdir -p dist for TARGETARCH in ${BUILD_ARCH}; do - docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH . + docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS -f Dockerfile.build -t builder:$TARGETARCH . 
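+    # OLLAMA_CUSTOM_CPU_DEFS is passed through so source builds can tune the llama.cpp cmake flags (see docs/development.md)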
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker rm builder-$TARGETARCH From 39928a42e8e2b68d5d904c70c4bd07f849e1b76d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 9 Jan 2024 20:29:58 -0800 Subject: [PATCH 4/4] Always dynamically load the llm server library This switches darwin to dynamic loading, and refactors the code now that no static linking of the library is used on any platform --- go.mod | 2 +- gpu/cpu_common.go | 21 +++ gpu/gpu_darwin.go | 18 ++- gpu/gpu_test.go | 2 +- llm/{dynamic_shim.c => dyn_ext_server.c} | 26 ++-- ...ext_server_common.go => dyn_ext_server.go} | 132 +++++++++++------- llm/{dynamic_shim.h => dyn_ext_server.h} | 24 ++-- llm/ext_server.go | 17 --- llm/ext_server_default.go | 82 ----------- llm/generate/gen_darwin.sh | 12 ++ llm/generate/gen_linux.sh | 6 - llm/generate/gen_windows.ps1 | 28 +++- llm/llm.go | 23 ++- llm/{shim.go => payload_common.go} | 51 ++++--- llm/payload_darwin.go | 8 ++ llm/payload_linux.go | 8 ++ llm/payload_test.go | 54 +++++++ llm/payload_windows.go | 8 ++ llm/shim_darwin.go | 16 --- llm/shim_ext_server.go | 107 -------------- llm/shim_ext_server_linux.go | 23 --- llm/shim_ext_server_windows.go | 31 ---- llm/shim_test.go | 54 ------- 23 files changed, 290 insertions(+), 463 deletions(-) create mode 100644 gpu/cpu_common.go rename llm/{dynamic_shim.c => dyn_ext_server.c} (83%) rename llm/{ext_server_common.go => dyn_ext_server.go} (72%) rename llm/{dynamic_shim.h => dyn_ext_server.h} (75%) delete mode 100644 llm/ext_server.go delete mode 100644 llm/ext_server_default.go rename llm/{shim.go => payload_common.go} (84%) create mode 100644 llm/payload_darwin.go create mode 100644 llm/payload_linux.go create mode 100644 llm/payload_test.go create mode 100644 llm/payload_windows.go delete mode 100644 llm/shim_darwin.go delete mode 100644 llm/shim_ext_server.go delete mode 100644 llm/shim_ext_server_linux.go delete mode 100644 llm/shim_ext_server_windows.go delete mode 100644 llm/shim_test.go diff --git a/go.mod b/go.mod index 0df1372b..0efefe2f 100644 --- a/go.mod +++ b/go.mod @@ -45,7 +45,7 @@ require ( golang.org/x/crypto v0.14.0 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.13.0 // indirect + golang.org/x/sys v0.13.0 golang.org/x/term v0.13.0 golang.org/x/text v0.13.0 // indirect google.golang.org/protobuf v1.30.0 // indirect diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go new file mode 100644 index 00000000..11649f6a --- /dev/null +++ b/gpu/cpu_common.go @@ -0,0 +1,21 @@ +package gpu + +import ( + "log" + + "golang.org/x/sys/cpu" +) + +func GetCPUVariant() string { + if cpu.X86.HasAVX2 { + log.Printf("CPU has AVX2") + return "avx2" + } + if cpu.X86.HasAVX { + log.Printf("CPU has AVX") + return "avx" + } + log.Printf("CPU does not have vector extensions") + // else LCD + return "" +} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 79645285..eac55c42 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -32,8 +32,15 @@ func CheckVRAM() (int64, error) { func GetGPUInfo() GpuInfo { mem, _ := getCPUMem() + if runtime.GOARCH == "amd64" { + return GpuInfo{ + Library: "default", + Variant: GetCPUVariant(), + memInfo: mem, + } + } return GpuInfo{ - Library: "default", + Library: "metal", memInfo: mem, } } @@ -45,12 +52,3 @@ func getCPUMem() (memInfo, error) { DeviceCount: 0, }, nil } - -func nativeInit() error { 
- return nil -} - -func GetCPUVariant() string { - // We don't yet have CPU based builds for Darwin... - return "" -} diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go index c260211e..010eaea5 100644 --- a/gpu/gpu_test.go +++ b/gpu/gpu_test.go @@ -9,7 +9,7 @@ import ( func TestBasicGetGPUInfo(t *testing.T) { info := GetGPUInfo() - assert.Contains(t, "cuda rocm cpu default", info.Library) + assert.Contains(t, "cuda rocm cpu metal", info.Library) switch runtime.GOOS { case "darwin": diff --git a/llm/dynamic_shim.c b/llm/dyn_ext_server.c similarity index 83% rename from llm/dynamic_shim.c rename to llm/dyn_ext_server.c index ca7c372a..111e4ab5 100644 --- a/llm/dynamic_shim.c +++ b/llm/dyn_ext_server.c @@ -1,4 +1,4 @@ -#include "dynamic_shim.h" +#include "dyn_ext_server.h" #include #include @@ -33,7 +33,7 @@ inline char *LOAD_ERR() { #define UNLOAD_LIBRARY(handle) dlclose(handle) #endif -void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, +void dyn_init(const char *libPath, struct dynamic_llama_server *s, ext_server_resp_t *err) { int i = 0; struct lookup { @@ -83,63 +83,63 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, } } -inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s, +inline void dyn_llama_server_init(struct dynamic_llama_server s, ext_server_params_t *sparams, ext_server_resp_t *err) { s.llama_server_init(sparams, err); } -inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) { +inline void dyn_llama_server_start(struct dynamic_llama_server s) { s.llama_server_start(); } -inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) { +inline void dyn_llama_server_stop(struct dynamic_llama_server s) { s.llama_server_stop(); } -inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s, +inline void dyn_llama_server_completion(struct dynamic_llama_server s, const char *json_req, ext_server_resp_t *resp) { s.llama_server_completion(json_req, resp); } -inline void dynamic_shim_llama_server_completion_next_result( +inline void dyn_llama_server_completion_next_result( struct dynamic_llama_server s, const int task_id, ext_server_task_result_t *result) { s.llama_server_completion_next_result(task_id, result); } -inline void dynamic_shim_llama_server_completion_cancel( +inline void dyn_llama_server_completion_cancel( struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { s.llama_server_completion_cancel(task_id, err); } -inline void dynamic_shim_llama_server_release_task_result( +inline void dyn_llama_server_release_task_result( struct dynamic_llama_server s, ext_server_task_result_t *result) { s.llama_server_release_task_result(result); } -inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s, +inline void dyn_llama_server_tokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_tokenize(json_req, json_resp, err); } -inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s, +inline void dyn_llama_server_detokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_detokenize(json_req, json_resp, err); } -inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s, +inline void dyn_llama_server_embedding(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_embedding(json_req, json_resp, err); } -inline 
void dynamic_shim_llama_server_release_json_resp( +inline void dyn_llama_server_release_json_resp( struct dynamic_llama_server s, char **json_resp) { s.llama_server_release_json_resp(json_resp); } diff --git a/llm/ext_server_common.go b/llm/dyn_ext_server.go similarity index 72% rename from llm/ext_server_common.go rename to llm/dyn_ext_server.go index b10ac60b..105df634 100644 --- a/llm/ext_server_common.go +++ b/llm/dyn_ext_server.go @@ -10,25 +10,25 @@ package llm #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin LDFLAGS: -lc++ -framework Accelerate #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm #cgo linux windows LDFLAGS: -lpthread #include -#include "ext_server.h" +#include "dyn_ext_server.h" */ import "C" + import ( "bytes" "context" "encoding/json" "fmt" "log" + "os" + "path/filepath" + "runtime" "strings" "sync" "time" @@ -37,21 +37,9 @@ import ( "github.com/jmorganca/ollama/api" ) -// TODO switch Linux to always be dynamic -// If that works out, then look at the impact of doing the same for Mac -type extServer interface { - LLM - llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) - llama_server_start() - llama_server_stop() - llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) - llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) - llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) - llama_server_release_task_result(result *C.ext_server_task_result_t) - llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_release_json_resp(json_resp **C.char) +type dynExtServer struct { + s C.struct_dynamic_llama_server + options api.Options } // Note: current implementation does not support concurrent instantiations @@ -76,11 +64,30 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error { return fmt.Errorf(C.GoString(resp.msg)) } -func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) { +// Note: current implementation does not support concurrent instantiations +var llm *dynExtServer + +func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) { if !mutex.TryLock() { log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } + updatePath(filepath.Dir(library)) + libPath := C.CString(library) + defer C.free(unsafe.Pointer(libPath)) + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var srv C.struct_dynamic_llama_server + C.dyn_init(libPath, &srv, &resp) + if resp.id < 0 { + mutex.Unlock() + return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) + } + llm = &dynExtServer{ + s: srv, + options: opts, + } + log.Printf("Loading Dynamic llm server: %s", library) var sparams C.ext_server_params_t 
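	// populate the C server params from the model path and api.Options before initializing the loaded server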
sparams.model = C.CString(model) @@ -129,20 +136,20 @@ func newExtServer(server extServer, model string, adapters, projectors []string, sparams.n_threads = C.uint(opts.NumThread) - log.Printf("Initializing internal llama server") - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - server.llama_server_init(&sparams, &resp) - if resp.id < 0 { - return nil, extServerResponseToErr(resp) + log.Printf("Initializing llama server") + initResp := newExtServerResp(128) + defer freeExtServerResp(initResp) + C.dyn_llama_server_init(llm.s, &sparams, &initResp) + if initResp.id < 0 { + return nil, extServerResponseToErr(initResp) } - log.Printf("Starting internal llama main loop") - server.llama_server_start() - return server, nil + log.Printf("Starting llama main loop") + C.dyn_llama_server_start(llm.s) + return llm, nil } -func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error { +func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { resp := newExtServerResp(128) defer freeExtServerResp(resp) var imageData []ImageData @@ -200,7 +207,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr req := C.CString(buffer.String()) defer C.free(unsafe.Pointer(req)) - llm.llama_server_completion(req, &resp) + C.dyn_llama_server_completion(llm.s, req, &resp) if resp.id < 0 { return extServerResponseToErr(resp) } @@ -211,7 +218,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr select { case <-ctx.Done(): // This handles the request cancellation - llm.llama_server_completion_cancel(resp.id, &resp) + C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) if resp.id < 0 { return extServerResponseToErr(resp) } else { @@ -219,13 +226,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr } default: var result C.ext_server_task_result_t - llm.llama_server_completion_next_result(resp.id, &result) + C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result) json_resp := C.GoString(result.json_resp) - llm.llama_server_release_task_result(&result) + C.dyn_llama_server_release_task_result(llm.s, &result) var p prediction if err := json.Unmarshal([]byte(json_resp), &p); err != nil { - llm.llama_server_completion_cancel(resp.id, &resp) + C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) if resp.id < 0 { return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) } else { @@ -266,7 +273,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr return fmt.Errorf("max retries exceeded") } -func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { +func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { data, err := json.Marshal(TokenizeRequest{Content: prompt}) if err != nil { return nil, fmt.Errorf("marshaling encode data: %w", err) @@ -276,11 +283,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_tokenize(req, &json_resp, &resp) + C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp) if resp.id < 0 { return nil, extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var encoded TokenizeResponse if err2 := 
json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { @@ -290,7 +297,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { return encoded.Tokens, err } -func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { +func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) { if len(tokens) == 0 { return "", nil } @@ -304,11 +311,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_detokenize(req, &json_resp, &resp) + C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp) if resp.id < 0 { return "", extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var decoded DetokenizeResponse if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { @@ -318,7 +325,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { return decoded.Content, err } -func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) { +func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { data, err := json.Marshal(TokenizeRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) @@ -329,11 +336,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_embedding(req, &json_resp, &resp) + C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp) if resp.id < 0 { return nil, extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var embedding EmbeddingResponse if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { @@ -343,7 +350,38 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err return embedding.Embedding, nil } -func close(llm extServer) { - llm.llama_server_stop() +func (llm *dynExtServer) Close() { + C.dyn_llama_server_stop(llm.s) mutex.Unlock() } + +func updatePath(dir string) { + if runtime.GOOS == "windows" { + tmpDir := filepath.Dir(dir) + pathComponents := strings.Split(os.Getenv("PATH"), ";") + i := 0 + for _, comp := range pathComponents { + if strings.EqualFold(comp, dir) { + return + } + // Remove any other prior paths to our temp dir + if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { + pathComponents[i] = comp + i++ + } + } + newPath := strings.Join(append([]string{dir}, pathComponents...), ";") + log.Printf("Updating PATH to %s", newPath) + os.Setenv("PATH", newPath) + } else { + pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") + for _, comp := range pathComponents { + if comp == dir { + return + } + } + newPath := strings.Join(append([]string{dir}, pathComponents...), ":") + log.Printf("Updating LD_LIBRARY_PATH to %s", newPath) + os.Setenv("LD_LIBRARY_PATH", newPath) + } +} diff --git a/llm/dynamic_shim.h b/llm/dyn_ext_server.h similarity index 75% rename from llm/dynamic_shim.h rename to llm/dyn_ext_server.h index 116ca722..cddf4a1f 100644 --- a/llm/dynamic_shim.h +++ b/llm/dyn_ext_server.h @@ -27,46 +27,46 @@ struct dynamic_llama_server { void (*llama_server_release_json_resp)(char **json_resp); }; -void 
dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+void dyn_init(const char *libPath, struct dynamic_llama_server *s,
               ext_server_resp_t *err);
 
 // No good way to call C function pointers from Go so inline the indirection
-void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+void dyn_llama_server_init(struct dynamic_llama_server s,
                                    ext_server_params_t *sparams,
                                    ext_server_resp_t *err);
 
-void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+void dyn_llama_server_start(struct dynamic_llama_server s);
 
-void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+void dyn_llama_server_stop(struct dynamic_llama_server s);
 
-void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+void dyn_llama_server_completion(struct dynamic_llama_server s,
                                  const char *json_req,
                                  ext_server_resp_t *resp);
 
-void dynamic_shim_llama_server_completion_next_result(
+void dyn_llama_server_completion_next_result(
     struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result);
 
-void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
                                         const int task_id,
                                         ext_server_resp_t *err);
 
-void dynamic_shim_llama_server_release_task_result(
+void dyn_llama_server_release_task_result(
     struct dynamic_llama_server s, ext_server_task_result_t *result);
 
-void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+void dyn_llama_server_tokenize(struct dynamic_llama_server s,
                                const char *json_req, char **json_resp,
                                ext_server_resp_t *err);
 
-void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+void dyn_llama_server_detokenize(struct dynamic_llama_server s,
                                  const char *json_req,
                                  char **json_resp,
                                  ext_server_resp_t *err);
 
-void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+void dyn_llama_server_embedding(struct dynamic_llama_server s,
                                 const char *json_req, char **json_resp,
                                 ext_server_resp_t *err);
 
-void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
                                         char **json_resp);
 
 #ifdef __cplusplus
diff --git a/llm/ext_server.go b/llm/ext_server.go
deleted file mode 100644
index c8a5f0b9..00000000
--- a/llm/ext_server.go
+++ /dev/null
@@ -1,17 +0,0 @@
-//go:build !darwin
-
-package llm
-
-import (
-	"fmt"
-
-	"github.com/jmorganca/ollama/api"
-)
-
-func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
-	// On windows and linux we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
-	// This ensures we can update the PATH at runtime to get everything loaded
-
-	// This should never happen as we'll always try to load one or more cpu dynamic libaries before hitting default
-	return nil, fmt.Errorf("no available default llm library")
-}
diff --git a/llm/ext_server_default.go b/llm/ext_server_default.go
deleted file mode 100644
index 31f05fb6..00000000
--- a/llm/ext_server_default.go
+++ /dev/null
@@ -1,82 +0,0 @@
-//go:build darwin
-
-package llm
-
-/*
-#include <stdlib.h>
-#include "ext_server.h"
-
-*/
-import "C"
-import (
-	"context"
-
-	"github.com/jmorganca/ollama/api"
-)
-
-// TODO - explore shifting Darwin to a dynamic loading pattern for consistency with Linux and Windows
-
-type llamaExtServer struct {
-	api.Options
-}
-
-func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
- 
C.llama_server_init(sparams, err) -} -func (llm *llamaExtServer) llama_server_start() { - C.llama_server_start() -} -func (llm *llamaExtServer) llama_server_stop() { - C.llama_server_stop() -} - -func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { - C.llama_server_completion(json_req, resp) -} -func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { - C.llama_server_completion_next_result(task_id, resp) -} -func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { - C.llama_server_completion_cancel(task_id, err) -} -func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { - C.llama_server_release_task_result(result) -} - -func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_tokenize(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_detokenize(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_embedding(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { - C.llama_server_release_json_resp(json_resp) -} - -func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) { - server := &llamaExtServer{opts} - return newExtServer(server, model, adapters, projectors, opts) -} - -func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { - return predict(ctx, llm, pred, fn) -} - -func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { - return encode(llm, ctx, prompt) -} - -func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { - return decode(llm, ctx, tokens) -} - -func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { - return embedding(llm, ctx, input) -} - -func (llm *llamaExtServer) Close() { - close(llm) -} diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index cabd8f75..b7f1f684 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -29,4 +29,16 @@ git_module_setup apply_patches build install +gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ + -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \ + ${BUILD_DIR}/lib/libcommon.a \ + ${BUILD_DIR}/lib/libllama.a \ + ${BUILD_DIR}/lib/libggml_static.a \ + -lpthread -ldl -lm -lc++ \ + -framework Accelerate \ + -framework Foundation \ + -framework Metal \ + -framework MetalKit \ + -framework MetalPerformanceShaders + cleanup diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 3fec7e6b..0c940ba5 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -104,12 +104,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then build install link_server_lib - gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ - -Wl,--whole-archive \ - ${BUILD_DIR}/lib/libext_server.a \ - -Wl,--no-whole-archive \ - ${BUILD_DIR}/lib/libcommon.a \ - ${BUILD_DIR}/lib/libllama.a fi else echo "Skipping CPU generation step as requested" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 9435fffa..1bc08c69 100644 --- 
a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -4,7 +4,7 @@ $ErrorActionPreference = "Stop" function init_vars { $script:llamacppDir = "../llama.cpp" - $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64") + $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64") $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static") if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on") @@ -63,16 +63,36 @@ init_vars git_module_setup apply_patches -# first build CPU based -$script:buildDir="${script:llamacppDir}/build/windows/cpu" +# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer +# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) +# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen +# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver +$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off") + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu" +write-host "Building LCD CPU" +build +install + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx" +write-host "Building AVX CPU" +build +install + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2" +write-host "Building AVX2 CPU" build install # Then build cuda as a dynamically loaded library +# TODO figure out how to detect cuda version init_vars $script:buildDir="${script:llamacppDir}/build/windows/cuda" -$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON") +$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on") build install diff --git a/llm/llm.go b/llm/llm.go index 05230b09..a414c3da 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -138,33 +138,30 @@ func Init(workdir string) error { return nativeInit(workdir) } -func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - shims := getShims(gpuInfo) +func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) { + dynLibs := getDynLibs(gpuInfo) // Check to see if the user has requested a specific library instead of auto-detecting demandLib := os.Getenv("OLLAMA_LLM_LIBRARY") if demandLib != "" { - libPath := availableShims[demandLib] + libPath := availableDynLibs[demandLib] if libPath == "" { log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib) } else { log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib) - shims = []string{libPath} + dynLibs = []string{libPath} } } - for _, shim := range shims { - // TODO - only applies on Darwin (switch to fully dynamic there too...) 
-		if shim == "default" {
-			break
-		}
-		srv, err := newDynamicShimExtServer(shim, model, adapters, projectors, opts)
+	err2 := fmt.Errorf("unable to locate suitable llm library")
+	for _, dynLib := range dynLibs {
+		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
 		if err == nil {
 			return srv, nil
 		}
-		log.Printf("Failed to load dynamic library %s %s", shim, err)
+		log.Printf("Failed to load dynamic library %s %s", dynLib, err)
+		err2 = err
 	}
-	return newDefaultExtServer(model, adapters, projectors, opts)
-
+	return nil, err2
 }
diff --git a/llm/shim.go b/llm/payload_common.go
similarity index 84%
rename from llm/shim.go
rename to llm/payload_common.go
index e68a8ec3..f6976768 100644
--- a/llm/shim.go
+++ b/llm/payload_common.go
@@ -18,42 +18,42 @@ import (
 // Library names may contain an optional variant separated by '_'
 // For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
 // Any library without a variant is the lowest common denominator
-var availableShims = map[string]string{}
+var availableDynLibs = map[string]string{}
 
 const pathComponentCount = 6
 
-// getShims returns an ordered list of shims to try, starting with the best
-func getShims(gpuInfo gpu.GpuInfo) []string {
+// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
+func getDynLibs(gpuInfo gpu.GpuInfo) []string {
 	// Short circuit if we know we're using the default built-in (darwin only)
 	if gpuInfo.Library == "default" {
 		return []string{"default"}
 	}
 	exactMatch := ""
-	shims := []string{}
-	altShims := []string{}
+	dynLibs := []string{}
+	altDynLibs := []string{}
 	requested := gpuInfo.Library
 	if gpuInfo.Variant != "" {
 		requested += "_" + gpuInfo.Variant
 	}
 	// Try to find an exact match
-	for cmp := range availableShims {
+	for cmp := range availableDynLibs {
 		if requested == cmp {
 			exactMatch = cmp
-			shims = []string{availableShims[cmp]}
+			dynLibs = []string{availableDynLibs[cmp]}
 			break
 		}
 	}
 	// Then for GPUs load alternates and sort the list for consistent load ordering
 	if gpuInfo.Library != "cpu" {
-		for cmp := range availableShims {
+		for cmp := range availableDynLibs {
 			if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
-				altShims = append(altShims, cmp)
+				altDynLibs = append(altDynLibs, cmp)
 			}
 		}
-		slices.Sort(altShims)
-		for _, altShim := range altShims {
-			shims = append(shims, availableShims[altShim])
+		slices.Sort(altDynLibs)
+		for _, altDynLib := range altDynLibs {
+			dynLibs = append(dynLibs, availableDynLibs[altDynLib])
 		}
 	}
@@ -65,27 +65,27 @@ func getShims(gpuInfo gpu.GpuInfo) []string {
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
 		if variant != "" {
-			for cmp := range availableShims {
+			for cmp := range availableDynLibs {
 				if cmp == "cpu_"+variant {
-					shims = append(shims, availableShims[cmp])
+					dynLibs = append(dynLibs, availableDynLibs[cmp])
 					break
 				}
 			}
 		} else {
-			shims = append(shims, availableShims["cpu"])
+			dynLibs = append(dynLibs, availableDynLibs["cpu"])
 		}
 	}
 
 	// Finally, if we didn't find any matches, LCD CPU FTW
-	if len(shims) == 0 {
-		shims = []string{availableShims["cpu"]}
+	if len(dynLibs) == 0 {
+		dynLibs = []string{availableDynLibs["cpu"]}
 	}
-	return shims
+	return dynLibs
 }
 
-func rocmShimPresent() bool {
-	for shimName := range availableShims {
-		if strings.HasPrefix(shimName, "rocm") {
+func rocmDynLibPresent() bool {
+	for dynLibName := range availableDynLibs {
+		if strings.HasPrefix(dynLibName, "rocm") {
 			return true
 		}
 	}
@@ -104,7 +104,6 @@ func nativeInit(workdir string) error {
 		return err
 	}
 	
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) - return nil } libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") @@ -118,7 +117,7 @@ func nativeInit(workdir string) error { for _, lib := range libs { // The last dir component is the variant name variant := filepath.Base(filepath.Dir(lib)) - availableShims[variant] = lib + availableDynLibs[variant] = lib } if err := verifyDriverAccess(); err != nil { @@ -126,9 +125,9 @@ func nativeInit(workdir string) error { } // Report which dynamic libraries we have loaded to assist troubleshooting - variants := make([]string, len(availableShims)) + variants := make([]string, len(availableDynLibs)) i := 0 - for variant := range availableShims { + for variant := range availableDynLibs { variants[i] = variant i++ } @@ -226,7 +225,7 @@ func verifyDriverAccess() error { return nil } // Only check ROCm access if we have the dynamic lib loaded - if rocmShimPresent() { + if rocmDynLibPresent() { // Verify we have permissions - either running as root, or we have group access to the driver fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) if err != nil { diff --git a/llm/payload_darwin.go b/llm/payload_darwin.go new file mode 100644 index 00000000..1a5f042a --- /dev/null +++ b/llm/payload_darwin.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so +var libEmbed embed.FS diff --git a/llm/payload_linux.go b/llm/payload_linux.go new file mode 100644 index 00000000..afef040a --- /dev/null +++ b/llm/payload_linux.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/build/linux/*/lib/*.so +var libEmbed embed.FS diff --git a/llm/payload_test.go b/llm/payload_test.go new file mode 100644 index 00000000..7a644713 --- /dev/null +++ b/llm/payload_test.go @@ -0,0 +1,54 @@ +package llm + +import ( + "testing" + + "github.com/jmorganca/ollama/gpu" + "github.com/stretchr/testify/assert" +) + +func TestGetDynLibs(t *testing.T) { + availableDynLibs = map[string]string{ + "cpu": "X_cpu", + } + assert.Equal(t, false, rocmDynLibPresent()) + res := getDynLibs(gpu.GpuInfo{Library: "cpu"}) + assert.Len(t, res, 1) + assert.Equal(t, availableDynLibs["cpu"], res[0]) + + availableDynLibs = map[string]string{ + "rocm_v5": "X_rocm_v5", + "rocm_v6": "X_rocm_v6", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmDynLibPresent()) + res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) + assert.Len(t, res, 3) + assert.Equal(t, availableDynLibs["rocm_v5"], res[0]) + assert.Equal(t, availableDynLibs["rocm_v6"], res[1]) + assert.Equal(t, availableDynLibs["cpu"], res[2]) + + res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) + assert.Len(t, res, 3) + assert.Equal(t, availableDynLibs["rocm_v6"], res[0]) + assert.Equal(t, availableDynLibs["rocm_v5"], res[1]) + assert.Equal(t, availableDynLibs["cpu"], res[2]) + + res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) + assert.Len(t, res, 1) + assert.Equal(t, availableDynLibs["cpu"], res[0]) + + res = getDynLibs(gpu.GpuInfo{Library: "default"}) + assert.Len(t, res, 1) + assert.Equal(t, "default", res[0]) + + availableDynLibs = map[string]string{ + "rocm": "X_rocm_v5", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmDynLibPresent()) + res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) + assert.Len(t, res, 2) + assert.Equal(t, availableDynLibs["rocm"], res[0]) + assert.Equal(t, availableDynLibs["cpu"], res[1]) +} diff --git a/llm/payload_windows.go b/llm/payload_windows.go new file mode 100644 index 00000000..21c6cc4d 
--- /dev/null
+++ b/llm/payload_windows.go
@@ -0,0 +1,8 @@
+package llm
+
+import (
+	"embed"
+)
+
+//go:embed llama.cpp/build/windows/*/lib/*.dll
+var libEmbed embed.FS
diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go
deleted file mode 100644
index 9ef8ef96..00000000
--- a/llm/shim_darwin.go
+++ /dev/null
@@ -1,16 +0,0 @@
-package llm
-
-import (
-	"embed"
-	"fmt"
-
-	"github.com/jmorganca/ollama/api"
-)
-
-//go:embed llama.cpp/ggml-metal.metal
-var libEmbed embed.FS
-
-func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
-	// should never happen...
-	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
-}
diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go
deleted file mode 100644
index 102f059c..00000000
--- a/llm/shim_ext_server.go
+++ /dev/null
@@ -1,107 +0,0 @@
-//go:build !darwin
-
-package llm
-
-/*
-
-#include <stdlib.h>
-#include "dynamic_shim.h"
-
-*/
-import "C"
-import (
-	"context"
-	"fmt"
-	"log"
-	"path/filepath"
-	"sync"
-	"unsafe"
-
-	"github.com/jmorganca/ollama/api"
-)
-
-type shimExtServer struct {
-	s       C.struct_dynamic_llama_server
-	options api.Options
-}
-
-// Note: current implementation does not support concurrent instantiations
-var shimMutex sync.Mutex
-var llm *shimExtServer
-
-func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
-}
-func (llm *shimExtServer) llama_server_start() {
-	C.dynamic_shim_llama_server_start(llm.s)
-}
-func (llm *shimExtServer) llama_server_stop() {
-	C.dynamic_shim_llama_server_stop(llm.s)
-}
-
-func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
-}
-func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
-}
-func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
-}
-func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
-}
-
-func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
-}
-func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
-}
-func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
-}
-func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
-}
-
-func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	updatePath(filepath.Dir(library))
-	libPath := C.CString(library)
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_dynamic_llama_server
-	C.dynamic_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		return nil, 
fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) - } - llm = &shimExtServer{ - s: srv, - options: opts, - } - log.Printf("Loading Dynamic Shim llm server: %s", library) - return newExtServer(llm, model, adapters, projectors, opts) -} - -func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { - return predict(ctx, llm, pred, fn) -} - -func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { - return encode(llm, ctx, prompt) -} - -func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) { - return decode(llm, ctx, tokens) -} - -func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { - return embedding(llm, ctx, input) -} - -func (llm *shimExtServer) Close() { - close(llm) -} diff --git a/llm/shim_ext_server_linux.go b/llm/shim_ext_server_linux.go deleted file mode 100644 index e4bfd15e..00000000 --- a/llm/shim_ext_server_linux.go +++ /dev/null @@ -1,23 +0,0 @@ -package llm - -import ( - "embed" - "log" - "os" - "strings" -) - -//go:embed llama.cpp/build/*/*/lib/*.so -var libEmbed embed.FS - -func updatePath(dir string) { - pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") - for _, comp := range pathComponents { - if comp == dir { - return - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ":") - log.Printf("Updating LD_LIBRARY_PATH to %s", newPath) - os.Setenv("LD_LIBRARY_PATH", newPath) -} diff --git a/llm/shim_ext_server_windows.go b/llm/shim_ext_server_windows.go deleted file mode 100644 index c218c6f3..00000000 --- a/llm/shim_ext_server_windows.go +++ /dev/null @@ -1,31 +0,0 @@ -package llm - -import ( - "embed" - "log" - "os" - "path/filepath" - "strings" -) - -//go:embed llama.cpp/build/windows/*/lib/*.dll -var libEmbed embed.FS - -func updatePath(dir string) { - tmpDir := filepath.Dir(dir) - pathComponents := strings.Split(os.Getenv("PATH"), ";") - i := 0 - for _, comp := range pathComponents { - if strings.EqualFold(comp, dir) { - return - } - // Remove any other prior paths to our temp dir - if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { - pathComponents[i] = comp - i++ - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ";") - log.Printf("Updating PATH to %s", newPath) - os.Setenv("PATH", newPath) -} diff --git a/llm/shim_test.go b/llm/shim_test.go deleted file mode 100644 index 8d49ce14..00000000 --- a/llm/shim_test.go +++ /dev/null @@ -1,54 +0,0 @@ -package llm - -import ( - "testing" - - "github.com/jmorganca/ollama/gpu" - "github.com/stretchr/testify/assert" -) - -func TestGetShims(t *testing.T) { - availableShims = map[string]string{ - "cpu": "X_cpu", - } - assert.Equal(t, false, rocmShimPresent()) - res := getShims(gpu.GpuInfo{Library: "cpu"}) - assert.Len(t, res, 1) - assert.Equal(t, availableShims["cpu"], res[0]) - - availableShims = map[string]string{ - "rocm_v5": "X_rocm_v5", - "rocm_v6": "X_rocm_v6", - "cpu": "X_cpu", - } - assert.Equal(t, true, rocmShimPresent()) - res = getShims(gpu.GpuInfo{Library: "rocm"}) - assert.Len(t, res, 3) - assert.Equal(t, availableShims["rocm_v5"], res[0]) - assert.Equal(t, availableShims["rocm_v6"], res[1]) - assert.Equal(t, availableShims["cpu"], res[2]) - - res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 3) - assert.Equal(t, availableShims["rocm_v6"], res[0]) - assert.Equal(t, availableShims["rocm_v5"], res[1]) - assert.Equal(t, availableShims["cpu"], 
res[2]) - - res = getShims(gpu.GpuInfo{Library: "cuda"}) - assert.Len(t, res, 1) - assert.Equal(t, availableShims["cpu"], res[0]) - - res = getShims(gpu.GpuInfo{Library: "default"}) - assert.Len(t, res, 1) - assert.Equal(t, "default", res[0]) - - availableShims = map[string]string{ - "rocm": "X_rocm_v5", - "cpu": "X_cpu", - } - assert.Equal(t, true, rocmShimPresent()) - res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 2) - assert.Equal(t, availableShims["rocm"], res[0]) - assert.Equal(t, availableShims["cpu"], res[1]) -}
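
For readers following the library-selection logic above: the ordering contract that `getDynLibs` implements, and that `llm/payload_test.go` locks in, can be sketched standalone. The `pickLibs` helper and the payload map below are hypothetical illustrations, not code from this patch, and the sketch deliberately ignores the CPU feature-variant branch (`cpu_avx`, `cpu_avx2`) for brevity.

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// pickLibs mirrors the selection order getDynLibs produces: the exact
// library_variant match first, then the remaining variants of the same
// library in sorted order, then the lowest-common-denominator CPU build.
func pickLibs(available map[string]string, library, variant string) []string {
	if library == "default" {
		return []string{"default"} // darwin built-in, no dynamic load
	}
	requested := library
	if variant != "" {
		requested += "_" + variant
	}
	libs := []string{}
	exact := ""
	if path, ok := available[requested]; ok {
		exact = requested
		libs = append(libs, path)
	}
	if library != "cpu" {
		alts := []string{}
		for name := range available {
			if strings.Split(name, "_")[0] == library && name != exact {
				alts = append(alts, name)
			}
		}
		sort.Strings(alts) // deterministic load order across runs
		for _, alt := range alts {
			libs = append(libs, available[alt])
		}
		libs = append(libs, available["cpu"]) // CPU fallback comes last
	}
	if len(libs) == 0 {
		libs = []string{available["cpu"]}
	}
	return libs
}

func main() {
	available := map[string]string{
		"rocm_v5": "X_rocm_v5",
		"rocm_v6": "X_rocm_v6",
		"cpu":     "X_cpu",
	}
	// Matches the payload_test.go expectation:
	// [X_rocm_v6 X_rocm_v5 X_cpu]
	fmt.Println(pickLibs(available, "rocm", "v6"))
}
```

This ordering is what lets a ROCm v6 system fall back to the v5 build, and ultimately to the CPU build, when the preferred library fails to load.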