From 6c5ccb11f993ccc88c4761b8c31e0fefcbc1900f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 15 Feb 2024 17:15:09 -0800
Subject: [PATCH] Revamp ROCm support

This refines where we extract the LLM libraries to by adding a new
OLLAMA_HOME env var, which defaults to `~/.ollama`. The logic was
already idempotent, so this should speed up startups after the first
time a new release is deployed. It also cleans up after itself.

We now build only a single ROCm version (latest major) on both Windows
and Linux. Given the large size of ROCm's tensor files, we split the
dependency out. It's bundled into the installer on Windows, and is a
separate download on Linux. The Linux install script now detects the
presence of AMD GPUs, checks whether ROCm v6 is already present, and if
not, downloads our dependency tar file.

For Linux discovery, we now use sysfs and check each GPU against what
ROCm supports so we can degrade to CPU gracefully instead of having
llama.cpp+rocm assert/crash on us. For Windows, we now use Go's Windows
dynamic library loading logic to access the amdhip64.dll APIs to query
the GPU information.
---
 .github/workflows/test.yaml | 24 +-
 Dockerfile | 27 +--
 app/ollama.iss | 8 +
 docs/development.md | 4 +-
 docs/linux.md | 8 +
 docs/troubleshooting.md | 37 ++++
 docs/windows.md | 3 +-
 gpu/amd.go | 101 ---------
 gpu/amd_common.go | 58 +++++
 gpu/amd_hip_windows.go | 141 ++++++++++++
 gpu/amd_linux.go | 411 +++++++++++++++++++++++++++++++++++
 gpu/amd_windows.go | 190 ++++++++++++++++
 gpu/assets.go | 60 +++++
 gpu/gpu.go | 110 +---------
 gpu/gpu_info.h | 1 -
 gpu/gpu_info_cuda.c | 10 +-
 gpu/gpu_info_rocm.c | 198 -----------------
 gpu/gpu_info_rocm.h | 59 -----
 llm/dyn_ext_server.c | 19 +-
 llm/dyn_ext_server.go | 27 +--
 llm/generate/gen_linux.sh | 14 +-
 llm/generate/gen_windows.ps1 | 90 +++++++-
 llm/llm.go | 12 +-
 llm/payload_common.go | 58 +++--
 llm/payload_linux.go | 2 +-
 scripts/build_linux.sh | 1 +
 server/routes.go | 6 +-
 27 files changed, 1091 insertions(+), 588 deletions(-)
 delete mode 100644 gpu/amd.go
 create mode 100644 gpu/amd_common.go
 create mode 100644 gpu/amd_hip_windows.go
 create mode 100644 gpu/amd_linux.go
 create mode 100644 gpu/amd_windows.go
 create mode 100644 gpu/assets.go
 delete mode 100644 gpu/gpu_info_rocm.c
 delete mode 100644 gpu/gpu_info_rocm.h

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 51be919d..11008042 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -7,12 +7,12 @@ jobs:
   generate:
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-2019]
         arch: [amd64, arm64]
         exclude:
           - os: ubuntu-latest
             arch: arm64
-          - os: windows-latest
+          - os: windows-2019
             arch: arm64
     runs-on: ${{ matrix.os }}
     env:
@@ -24,7 +24,18 @@ jobs:
           go-version: '1.22'
           cache: true
       - run: go get ./...
+      - run: |
+          $gopath=(get-command go).source | split-path -parent
+          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          cd $env:GITHUB_WORKSPACE
+          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$env:PATH"
+          go generate -x ./...
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        name: "Windows Go Generate"
       - run: go generate -x ./...
+        if: ${{ ! 
startsWith(matrix.os, 'windows-') }} + name: "Unix Go Generate" - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries @@ -62,7 +73,6 @@ jobs: strategy: matrix: rocm-version: - - '5.7.1' - '6.0' runs-on: linux container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} @@ -91,12 +101,12 @@ jobs: lint: strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, macos-latest, windows-2019] arch: [amd64, arm64] exclude: - os: ubuntu-latest arch: arm64 - - os: windows-latest + - os: windows-2019 arch: arm64 - os: macos-latest arch: amd64 @@ -130,12 +140,12 @@ jobs: needs: generate strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, macos-latest, windows-2019] arch: [amd64] exclude: - os: ubuntu-latest arch: arm64 - - os: windows-latest + - os: windows-2019 arch: arm64 runs-on: ${{ matrix.os }} env: diff --git a/Dockerfile b/Dockerfile index f996bba0..741b6b08 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ ARG GOLANG_VERSION=1.22.1 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION=11.3.1 +ARG ROCM_VERSION=6.0 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code @@ -28,7 +29,7 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate ARG CGO_CFLAGS RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh -FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64 +FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -39,18 +40,14 @@ WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +RUN mkdir /tmp/scratch && \ + for dep in $(cat /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/x86_64/rocm*/lib/deps.txt) ; do \ + cp ${dep} /tmp/scratch/ || exit 1 ; \ + done && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \ + mkdir -p /go/src/github.com/jmorganca/ollama/dist/deps/ && \ + (cd /tmp/scratch/ && tar czvf /go/src/github.com/jmorganca/ollama/dist/deps/rocm-amd64-deps.tgz . ) -FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64 -ARG CMAKE_VERSION -COPY ./scripts/rh_linux_deps.sh / -RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH -ENV LIBRARY_PATH /opt/amdgpu/lib64 -COPY --from=llm-code / /go/src/github.com/jmorganca/ollama/ -WORKDIR /go/src/github.com/jmorganca/ollama/llm/generate -ARG CGO_CFLAGS -ARG AMDGPU_TARGETS -RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -91,8 +88,8 @@ COPY . . 
COPY --from=cpu_avx-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -COPY --from=rocm-5-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -COPY --from=rocm-6-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS RUN go build . @@ -117,7 +114,7 @@ RUN apt-get update && apt-get install -y ca-certificates COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama # Radeon images are much larger so we keep it distinct from the CPU/CUDA image -FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm +FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama EXPOSE 11434 diff --git a/app/ollama.iss b/app/ollama.iss index 473a85b3..df61ac4c 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -91,6 +91,14 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\windeps\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion +; Assumes v5.7, may need adjustments for v6 +#if GetEnv("HIP_PATH") != "" + Source: "{#GetEnv('HIP_PATH')}\bin\hipblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion + Source: "{#GetEnv('HIP_PATH')}\bin\rocblas.dll"; DestDir: "{app}\rocm\"; Flags: ignoreversion + ; amdhip64.dll dependency comes from the driver and must be installed already + Source: "{#GetEnv('HIP_PATH')}\bin\rocblas\library\*"; DestDir: "{app}\rocm\rocblas\library\"; Flags: ignoreversion +#endif + [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" diff --git a/docs/development.md b/docs/development.md index 33110e01..993aed9e 100644 --- a/docs/development.md +++ b/docs/development.md @@ -116,7 +116,7 @@ Note: The windows build for Ollama is still under development. Install required tools: -- MSVC toolchain - C/C++ and cmake as minimal requirements +- MSVC toolchain - C/C++ and cmake as minimal requirements - You must build from a "Developer Shell" with the environment variables set - go version 1.22 or higher - MinGW (pick one variant) with GCC. - @@ -132,6 +132,6 @@ go build . #### Windows CUDA (NVIDIA) -In addition to the common Windows development tools described above, install: +In addition to the common Windows development tools described above, install CUDA **AFTER** you install MSVC. 
 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
diff --git a/docs/linux.md b/docs/linux.md
index 29110b05..c7014ece 100644
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -10,6 +10,14 @@ Install Ollama running this one-liner:
 curl -fsSL https://ollama.com/install.sh | sh
 ```
 
+## AMD Radeon GPU support
+
+While AMD has contributed the `amdgpu` driver upstream to the official Linux
+kernel source, that version is older and may not support all ROCm features. We
+recommend you install the latest driver from
+https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+GPU.
+
 ## Manual install
 
 ### Download the `ollama` binary
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index a5fb301f..1f2a6b1f 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -67,6 +67,43 @@ You can see what features your CPU has with the following.
 cat /proc/cpuinfo| grep flags | head -1
 ```
 
+## AMD Radeon GPU Support
+
+Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
+some cases you can force the system to use a similar GPU type. For example,
+the Radeon RX 5400 is `gfx1034` (also known as 10.3.4); however, ROCm does not
+support this patch level, and the closest supported version is `gfx1030`. You
+can use the environment variable `HSA_OVERRIDE_GFX_VERSION` with `x.y.z`
+syntax. For instance, to force the system to run on the RX 5400, you would set
+`HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server.
+
+At this time, the known supported GPU types are the following (this list may
+change from release to release):
+- gfx900
+- gfx906
+- gfx908
+- gfx90a
+- gfx940
+- gfx941
+- gfx942
+- gfx1030
+- gfx1100
+- gfx1101
+- gfx1102
+
+This override will not work for every unsupported GPU. Reach out on [Discord](https://discord.gg/ollama)
+or file an [issue](https://github.com/ollama/ollama/issues) for additional help.
+
+
+## Installing older versions on Linux
+
+If you run into problems on Linux and want to install an older version, you can
+tell the install script which version to install.
+
+```sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.27" sh
+```
+
 ## Known issues
 
 * N/A
\ No newline at end of file
diff --git a/docs/windows.md b/docs/windows.md
index 875a3dc0..49d579c9 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -4,7 +4,7 @@ Welcome to the Ollama Windows preview.
 
 No more WSL required!
 
-Ollama now runs as a native Windows application, including NVIDIA GPU support.
+Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
 After installing Ollama Windows Preview, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
@@ -21,6 +21,7 @@ Logs will often be helpful in dianosing the problem (see
 
 * Windows 10 or newer, Home or Pro
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
+* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
 
 ## API Access
 
diff --git a/gpu/amd.go b/gpu/amd.go
deleted file mode 100644
index c21b7741..00000000
--- a/gpu/amd.go
+++ /dev/null
@@ -1,101 +0,0 @@
-package gpu
-
-import (
-	"bufio"
-	"errors"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-)
-
-// TODO - windows vs. 
non-windows vs darwin - -// Discovery logic for AMD/ROCm GPUs - -const ( - DriverVersionFile = "/sys/module/amdgpu/version" - GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties" - // TODO probably break these down per GPU to make the logic simpler - GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line - GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory" -) - -func AMDDetected() bool { - // Some driver versions (older?) don't have a version file, so just lookup the parent dir - sysfsDir := filepath.Dir(DriverVersionFile) - _, err := os.Stat(sysfsDir) - if errors.Is(err, os.ErrNotExist) { - slog.Debug("amd driver not detected " + sysfsDir) - return false - } else if err != nil { - slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err)) - return false - } - return true -} - -func AMDDriverVersion() (string, error) { - _, err := os.Stat(DriverVersionFile) - if err != nil { - return "", fmt.Errorf("amdgpu file stat error: %s %w", DriverVersionFile, err) - } - fp, err := os.Open(DriverVersionFile) - if err != nil { - return "", err - } - defer fp.Close() - verString, err := io.ReadAll(fp) - if err != nil { - return "", err - } - return strings.TrimSpace(string(verString)), nil -} - -func AMDGFXVersions() []Version { - res := []Version{} - matches, _ := filepath.Glob(GPUPropertiesFileGlob) - for _, match := range matches { - fp, err := os.Open(match) - if err != nil { - slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err)) - continue - } - defer fp.Close() - - scanner := bufio.NewScanner(fp) - // optionally, resize scanner's capacity for lines over 64K, see next example - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if strings.HasPrefix(line, "gfx_target_version") { - ver := strings.Fields(line) - if len(ver) != 2 || len(ver[1]) < 5 { - slog.Debug("malformed " + line) - continue - } - l := len(ver[1]) - patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32) - minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32) - major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32) - if err1 != nil || err2 != nil || err3 != nil { - slog.Debug("malformed int " + line) - continue - } - - res = append(res, Version{ - Major: uint(major), - Minor: uint(minor), - Patch: uint(patch), - }) - } - } - } - return res -} - -func (v Version) ToGFXString() string { - return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch) -} diff --git a/gpu/amd_common.go b/gpu/amd_common.go new file mode 100644 index 00000000..deb931ff --- /dev/null +++ b/gpu/amd_common.go @@ -0,0 +1,58 @@ +//go:build linux || windows + +package gpu + +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "strconv" + "strings" +) + +// Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns +func rocmLibUsable(libDir string) bool { + slog.Debug("evaluating potential rocm lib dir " + libDir) + for _, g := range ROCmLibGlobs { + res, _ := filepath.Glob(filepath.Join(libDir, g)) + if len(res) == 0 { + return false + } + } + return true +} + +func GetSupportedGFX(libDir string) ([]string, error) { + var ret []string + files, err := filepath.Glob(filepath.Join(libDir, "rocblas", "library", "TensileLibrary_lazy_gfx*.dat")) + if err != nil { + return nil, err + } + for _, file := range files { + ret = append(ret, strings.TrimSuffix(strings.TrimPrefix(filepath.Base(file), "TensileLibrary_lazy_"), ".dat")) + } + return ret, nil +} + +func 
amdSetVisibleDevices(ids []int, skip map[int]interface{}) { + // Set the visible devices if not already set + // TODO - does sort order matter? + devices := []string{} + for i := range ids { + slog.Debug(fmt.Sprintf("i=%d", i)) + if _, skipped := skip[i]; skipped { + slog.Debug("skipped") + continue + } + devices = append(devices, strconv.Itoa(i)) + } + slog.Debug(fmt.Sprintf("devices=%v", devices)) + + val := strings.Join(devices, ",") + err := os.Setenv("HIP_VISIBLE_DEVICES", val) + if err != nil { + slog.Warn(fmt.Sprintf("failed to set env: %s", err)) + } + slog.Debug("HIP_VISIBLE_DEVICES=" + val) +} diff --git a/gpu/amd_hip_windows.go b/gpu/amd_hip_windows.go new file mode 100644 index 00000000..14a6c7d6 --- /dev/null +++ b/gpu/amd_hip_windows.go @@ -0,0 +1,141 @@ +package gpu + +import ( + "fmt" + "log/slog" + "strconv" + "syscall" + "unsafe" + + "golang.org/x/sys/windows" +) + +const ( + hipSuccess = 0 + hipErrorNoDevice = 100 +) + +type hipDevicePropMinimal struct { + Name [256]byte + unused1 [140]byte + GcnArchName [256]byte // gfx#### + iGPU int // Doesn't seem to actually report correctly + unused2 [128]byte +} + +// Wrap the amdhip64.dll library for GPU discovery +type HipLib struct { + dll windows.Handle + hipGetDeviceCount uintptr + hipGetDeviceProperties uintptr + hipMemGetInfo uintptr + hipSetDevice uintptr + hipDriverGetVersion uintptr +} + +func NewHipLib() (*HipLib, error) { + h, err := windows.LoadLibrary("amdhip64.dll") + if err != nil { + return nil, fmt.Errorf("unable to load amdhip64.dll: %w", err) + } + hl := &HipLib{} + hl.dll = h + hl.hipGetDeviceCount, err = windows.GetProcAddress(hl.dll, "hipGetDeviceCount") + if err != nil { + return nil, err + } + hl.hipGetDeviceProperties, err = windows.GetProcAddress(hl.dll, "hipGetDeviceProperties") + if err != nil { + return nil, err + } + hl.hipMemGetInfo, err = windows.GetProcAddress(hl.dll, "hipMemGetInfo") + if err != nil { + return nil, err + } + hl.hipSetDevice, err = windows.GetProcAddress(hl.dll, "hipSetDevice") + if err != nil { + return nil, err + } + hl.hipDriverGetVersion, err = windows.GetProcAddress(hl.dll, "hipDriverGetVersion") + if err != nil { + return nil, err + } + return hl, nil +} + +// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup +// so we have to unload/reset the library after we do our initial discovery +// to make sure our updates to that variable are processed by llama.cpp +func (hl *HipLib) Release() { + err := windows.FreeLibrary(hl.dll) + if err != nil { + slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err)) + } + hl.dll = 0 +} + +func (hl *HipLib) AMDDriverVersion() (string, error) { + if hl.dll == 0 { + return "", fmt.Errorf("dll has been unloaded") + } + var version int + status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version))) + if status != hipSuccess { + return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) + } + return strconv.Itoa(version), nil +} + +func (hl *HipLib) HipGetDeviceCount() int { + if hl.dll == 0 { + slog.Error("dll has been unloaded") + return 0 + } + var count int + status, _, err := syscall.SyscallN(hl.hipGetDeviceCount, uintptr(unsafe.Pointer(&count))) + if status == hipErrorNoDevice { + slog.Info("AMD ROCm reports no devices found") + return 0 + } + if status != hipSuccess { + slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err)) + } + return count +} + +func (hl *HipLib) HipSetDevice(device int) error { + if hl.dll == 0 { + return 
fmt.Errorf("dll has been unloaded") + } + status, _, err := syscall.SyscallN(hl.hipSetDevice, uintptr(device)) + if status != hipSuccess { + return fmt.Errorf("failed call to hipSetDevice: %d %s", status, err) + } + return nil +} + +func (hl *HipLib) HipGetDeviceProperties(device int) (*hipDevicePropMinimal, error) { + if hl.dll == 0 { + return nil, fmt.Errorf("dll has been unloaded") + } + var props hipDevicePropMinimal + status, _, err := syscall.SyscallN(hl.hipGetDeviceProperties, uintptr(unsafe.Pointer(&props)), uintptr(device)) + if status != hipSuccess { + return nil, fmt.Errorf("failed call to hipGetDeviceProperties: %d %s", status, err) + } + return &props, nil +} + +// free, total, err +func (hl *HipLib) HipMemGetInfo() (uint64, uint64, error) { + if hl.dll == 0 { + return 0, 0, fmt.Errorf("dll has been unloaded") + } + var totalMemory uint64 + var freeMemory uint64 + status, _, err := syscall.SyscallN(hl.hipMemGetInfo, uintptr(unsafe.Pointer(&freeMemory)), uintptr(unsafe.Pointer(&totalMemory))) + if status != hipSuccess { + return 0, 0, fmt.Errorf("failed call to hipMemGetInfo: %d %s", status, err) + } + return freeMemory, totalMemory, nil +} diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go new file mode 100644 index 00000000..c775b71d --- /dev/null +++ b/gpu/amd_linux.go @@ -0,0 +1,411 @@ +package gpu + +import ( + "bufio" + "errors" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "slices" + "strconv" + "strings" + + "github.com/jmorganca/ollama/version" +) + +// Discovery logic for AMD/ROCm GPUs + +const ( + curlMsg = "curl -fsSL https://github.com/ollama/ollama/releases/download/v%s/rocm-amd64-deps.tgz | tar -zxf - -C %s" + DriverVersionFile = "/sys/module/amdgpu/version" + AMDNodesSysfsDir = "/sys/class/kfd/kfd/topology/nodes/" + GPUPropertiesFileGlob = AMDNodesSysfsDir + "*/properties" + + // Prefix with the node dir + GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line + GPUUsedMemoryFileGlob = "mem_banks/*/used_memory" + RocmStandardLocation = "/opt/rocm/lib" +) + +var ( + // Used to validate if the given ROCm lib is usable + ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here... 
+) + +// Gather GPU information from the amdgpu driver if any supported GPUs are detected +// HIP_VISIBLE_DEVICES will be set if we detect a mix of unsupported and supported devices +// and the user hasn't already set this variable +func AMDGetGPUInfo(resp *GpuInfo) { + // TODO - DRY this out with windows + if !AMDDetected() { + return + } + skip := map[int]interface{}{} + + // Opportunistic logging of driver version to aid in troubleshooting + ver, err := AMDDriverVersion() + if err == nil { + slog.Info("AMD Driver: " + ver) + } else { + // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU + slog.Warn(fmt.Sprintf("ollama recommends running the https://www.amd.com/en/support/linux-drivers: %s", err)) + } + + // If the user has specified exactly which GPUs to use, look up their memory + visibleDevices := os.Getenv("HIP_VISIBLE_DEVICES") + if visibleDevices != "" { + ids := []int{} + for _, idStr := range strings.Split(visibleDevices, ",") { + id, err := strconv.Atoi(idStr) + if err != nil { + slog.Warn(fmt.Sprintf("malformed HIP_VISIBLE_DEVICES=%s %s", visibleDevices, err)) + } else { + ids = append(ids, id) + } + } + amdProcMemLookup(resp, nil, ids) + return + } + + // Gather GFX version information from all detected cards + gfx := AMDGFXVersions() + verStrings := []string{} + for i, v := range gfx { + verStrings = append(verStrings, v.ToGFXString()) + if v.Major == 0 { + // Silently skip CPUs + skip[i] = struct{}{} + continue + } + if v.Major < 9 { + // TODO consider this a build-time setting if we can support 8xx family GPUs + slog.Warn(fmt.Sprintf("amdgpu [%d] too old %s", i, v.ToGFXString())) + skip[i] = struct{}{} + } + } + slog.Info(fmt.Sprintf("detected amdgpu versions %v", verStrings)) + + // Abort if all GPUs are skipped + if len(skip) >= len(gfx) { + slog.Info("all detected amdgpus are skipped, falling back to CPU") + return + } + + // If we got this far, then we have at least 1 GPU that's a ROCm candidate, so make sure we have a lib + libDir, err := AMDValidateLibDir() + if err != nil { + slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err)) + return + } + + gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + if gfxOverride == "" { + supported, err := GetSupportedGFX(libDir) + if err != nil { + slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err)) + return + } + slog.Debug(fmt.Sprintf("rocm supported GPU types %v", supported)) + + for i, v := range gfx { + if !slices.Contains[[]string, string](supported, v.ToGFXString()) { + slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, v.ToGFXString(), libDir, supported)) + // TODO - consider discrete markdown just for ROCM troubleshooting? 
+ slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") + skip[i] = struct{}{} + } else { + slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, v.ToGFXString())) + } + } + } else { + slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + } + + if len(skip) >= len(gfx) { + slog.Info("all detected amdgpus are skipped, falling back to CPU") + return + } + + ids := make([]int, len(gfx)) + i := 0 + for k := range gfx { + ids[i] = k + i++ + } + amdProcMemLookup(resp, skip, ids) + if resp.memInfo.DeviceCount == 0 { + return + } + if len(skip) > 0 { + amdSetVisibleDevices(ids, skip) + } +} + +// Walk the sysfs nodes for the available GPUs and gather information from them +// skipping over any devices in the skip map +func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { + resp.memInfo.DeviceCount = 0 + resp.memInfo.TotalMemory = 0 + resp.memInfo.FreeMemory = 0 + if len(ids) == 0 { + slog.Debug("discovering all amdgpu devices") + entries, err := os.ReadDir(AMDNodesSysfsDir) + if err != nil { + slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err)) + return + } + for _, node := range entries { + if !node.IsDir() { + continue + } + id, err := strconv.Atoi(node.Name()) + if err != nil { + slog.Warn("malformed amdgpu sysfs node id " + node.Name()) + continue + } + ids = append(ids, id) + } + } + slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids)) + + for _, id := range ids { + if _, skipped := skip[id]; skipped { + continue + } + totalMemory := uint64(0) + usedMemory := uint64(0) + propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob) + propFiles, err := filepath.Glob(propGlob) + if err != nil { + slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err)) + } + // 1 or more memory banks - sum the values of all of them + for _, propFile := range propFiles { + fp, err := os.Open(propFile) + if err != nil { + slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", propFile, err)) + continue + } + defer fp.Close() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(line, "size_in_bytes") { + ver := strings.Fields(line) + if len(ver) != 2 { + slog.Warn("malformed " + line) + continue + } + bankSizeInBytes, err := strconv.ParseUint(ver[1], 10, 64) + if err != nil { + slog.Warn("malformed int " + line) + continue + } + totalMemory += bankSizeInBytes + } + } + } + if totalMemory == 0 { + continue + } + usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob) + usedFiles, err := filepath.Glob(usedGlob) + if err != nil { + slog.Warn(fmt.Sprintf("error looking up used GPU memory: %s %s", usedGlob, err)) + continue + } + for _, usedFile := range usedFiles { + fp, err := os.Open(usedFile) + if err != nil { + slog.Warn(fmt.Sprintf("failed to open sysfs node file %s: %s", usedFile, err)) + continue + } + defer fp.Close() + data, err := io.ReadAll(fp) + if err != nil { + slog.Warn(fmt.Sprintf("failed to read sysfs node file %s: %s", usedFile, err)) + continue + } + used, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + slog.Warn(fmt.Sprintf("malformed used memory %s: %s", string(data), err)) + continue + } + usedMemory += used + } + slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory)) + slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory 
%d", id, (totalMemory - usedMemory))) + resp.memInfo.DeviceCount++ + resp.memInfo.TotalMemory += totalMemory + resp.memInfo.FreeMemory += (totalMemory - usedMemory) + } + if resp.memInfo.DeviceCount > 0 { + resp.Library = "rocm" + } +} + +// Quick check for AMD driver so we can skip amdgpu discovery if not present +func AMDDetected() bool { + // Some driver versions (older?) don't have a version file, so just lookup the parent dir + sysfsDir := filepath.Dir(DriverVersionFile) + _, err := os.Stat(sysfsDir) + if errors.Is(err, os.ErrNotExist) { + slog.Debug("amdgpu driver not detected " + sysfsDir) + return false + } else if err != nil { + slog.Debug(fmt.Sprintf("error looking up amd driver %s %s", sysfsDir, err)) + return false + } + return true +} + +func setupLink(source, target string) error { + if err := os.RemoveAll(target); err != nil { + return fmt.Errorf("failed to remove old rocm directory %s %w", target, err) + } + if err := os.Symlink(source, target); err != nil { + return fmt.Errorf("failed to create link %s => %s %w", source, target, err) + } + slog.Debug(fmt.Sprintf("host rocm linked %s => %s", source, target)) + return nil +} + +// Ensure the AMD rocm lib dir is wired up +// Prefer to use host installed ROCm, as long as it meets our minimum requirements +// failing that, tell the user how to download it on their own +func AMDValidateLibDir() (string, error) { + // We rely on the rpath compiled into our library to find rocm + // so we establish a symlink to wherever we find it on the system + // to $AssetsDir/rocm + + // If we already have a rocm dependency wired, nothing more to do + assetsDir, err := AssetsDir() + if err != nil { + return "", fmt.Errorf("unable to lookup lib dir: %w", err) + } + // Versioned directory + rocmTargetDir := filepath.Join(assetsDir, "rocm") + if rocmLibUsable(rocmTargetDir) { + return rocmTargetDir, nil + } + // Parent dir (unversioned) + commonRocmDir := filepath.Join(filepath.Dir(assetsDir), "rocm") + if rocmLibUsable(commonRocmDir) { + return rocmTargetDir, setupLink(commonRocmDir, rocmTargetDir) + } + + // Prefer explicit HIP env var + hipPath := os.Getenv("HIP_PATH") + if hipPath != "" { + hipLibDir := filepath.Join(hipPath, "lib") + if rocmLibUsable(hipLibDir) { + slog.Debug("detected ROCM via HIP_PATH=" + hipPath) + return rocmTargetDir, setupLink(hipLibDir, rocmTargetDir) + } + } + + // Scan the library path for potential matches + ldPaths := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") + for _, ldPath := range ldPaths { + d, err := filepath.Abs(ldPath) + if err != nil { + continue + } + if rocmLibUsable(d) { + return rocmTargetDir, setupLink(d, rocmTargetDir) + } + } + + // Well known location(s) + if rocmLibUsable("/opt/rocm/lib") { + return rocmTargetDir, setupLink("/opt/rocm/lib", rocmTargetDir) + } + err = os.MkdirAll(rocmTargetDir, 0755) + if err != nil { + return "", fmt.Errorf("failed to create empty rocm dir %s %w", rocmTargetDir, err) + } + + // If we still haven't found a usable rocm, the user will have to download it on their own + slog.Warn("amdgpu detected, but no compatible rocm library found. 
Either install rocm v6, or run the following") + slog.Warn(fmt.Sprintf(curlMsg, version.Version, rocmTargetDir)) + return "", fmt.Errorf("no suitable rocm found, falling back to CPU") +} + +func AMDDriverVersion() (string, error) { + _, err := os.Stat(DriverVersionFile) + if err != nil { + return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) + } + fp, err := os.Open(DriverVersionFile) + if err != nil { + return "", err + } + defer fp.Close() + verString, err := io.ReadAll(fp) + if err != nil { + return "", err + } + return strings.TrimSpace(string(verString)), nil +} + +func AMDGFXVersions() map[int]Version { + res := map[int]Version{} + matches, _ := filepath.Glob(GPUPropertiesFileGlob) + for _, match := range matches { + fp, err := os.Open(match) + if err != nil { + slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err)) + continue + } + defer fp.Close() + i, err := strconv.Atoi(filepath.Base(filepath.Dir(match))) + if err != nil { + slog.Debug(fmt.Sprintf("failed to parse node ID %s", err)) + continue + } + + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(line, "gfx_target_version") { + ver := strings.Fields(line) + if len(ver) != 2 || len(ver[1]) < 5 { + + if ver[1] == "0" { + // Silently skip the CPU + continue + } else { + slog.Debug("malformed " + line) + } + res[i] = Version{ + Major: 0, + Minor: 0, + Patch: 0, + } + continue + } + l := len(ver[1]) + patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32) + minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32) + major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32) + if err1 != nil || err2 != nil || err3 != nil { + slog.Debug("malformed int " + line) + continue + } + + res[i] = Version{ + Major: uint(major), + Minor: uint(minor), + Patch: uint(patch), + } + } + } + } + return res +} + +func (v Version) ToGFXString() string { + return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch) +} diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go new file mode 100644 index 00000000..5a965482 --- /dev/null +++ b/gpu/amd_windows.go @@ -0,0 +1,190 @@ +package gpu + +import ( + "bytes" + "fmt" + "log/slog" + "os" + "path/filepath" + "slices" + "strings" +) + +const ( + RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob? + + // TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true + iGPUName = "AMD Radeon(TM) Graphics" +) + +var ( + // Used to validate if the given ROCm lib is usable + ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here... 
+) + +func AMDGetGPUInfo(resp *GpuInfo) { + hl, err := NewHipLib() + if err != nil { + slog.Debug(err.Error()) + return + } + defer hl.Release() + skip := map[int]interface{}{} + ids := []int{} + resp.memInfo.DeviceCount = 0 + resp.memInfo.TotalMemory = 0 + resp.memInfo.FreeMemory = 0 + + ver, err := hl.AMDDriverVersion() + if err == nil { + slog.Info("AMD Driver: " + ver) + } else { + // For now this is benign, but we may eventually need to fail compatibility checks + slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err)) + } + + // Note: the HIP library automatically handles HIP_VISIBLE_DEVICES + count := hl.HipGetDeviceCount() + if count == 0 { + return + } + libDir, err := AMDValidateLibDir() + if err != nil { + slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err)) + return + } + + var supported []string + gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") + if gfxOverride == "" { + supported, err = GetSupportedGFX(libDir) + if err != nil { + slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err)) + return + } + } else { + slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) + } + + slog.Info(fmt.Sprintf("detected %d hip devices", count)) + for i := 0; i < count; i++ { + ids = append(ids, i) + err = hl.HipSetDevice(i) + if err != nil { + slog.Warn(fmt.Sprintf("[%d] %s", i, err)) + skip[i] = struct{}{} + continue + } + + props, err := hl.HipGetDeviceProperties(i) + if err != nil { + slog.Warn(fmt.Sprintf("[%d] %s", i, err)) + skip[i] = struct{}{} + continue + } + n := bytes.IndexByte(props.Name[:], 0) + name := string(props.Name[:n]) + slog.Info(fmt.Sprintf("[%d] Name: %s", i, name)) + n = bytes.IndexByte(props.GcnArchName[:], 0) + gfx := string(props.GcnArchName[:n]) + slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx)) + //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 + // TODO Why isn't props.iGPU accurate!? + if strings.EqualFold(name, iGPUName) { + slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i)) + skip[i] = struct{}{} + continue + } + if gfxOverride == "" { + if !slices.Contains[[]string, string](supported, gfx) { + slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported)) + // TODO - consider discrete markdown just for ROCM troubleshooting? + slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") + skip[i] = struct{}{} + continue + } else { + slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx)) + } + } + + totalMemory, freeMemory, err := hl.HipMemGetInfo() + if err != nil { + slog.Warn(fmt.Sprintf("[%d] %s", i, err)) + continue + } + + // TODO according to docs, freeMem may lie on windows! 
+ slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory)) + slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory)) + resp.memInfo.DeviceCount++ + resp.memInfo.TotalMemory += totalMemory + resp.memInfo.FreeMemory += freeMemory + } + if resp.memInfo.DeviceCount > 0 { + resp.Library = "rocm" + } + // Abort if all GPUs are skipped + if len(skip) >= count { + slog.Info("all detected amdgpus are skipped, falling back to CPU") + return + } + if len(skip) > 0 { + amdSetVisibleDevices(ids, skip) + } + UpdatePath(libDir) +} + +func AMDValidateLibDir() (string, error) { + // On windows non-admins typically can't create links + // so instead of trying to rely on rpath and a link in + // $LibDir/rocm, we instead rely on setting PATH to point + // to the location of the ROCm library + + // Installer payload location + exe, err := os.Executable() + if err == nil { + rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") + if rocmLibUsable(rocmTargetDir) { + slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) + return rocmTargetDir, nil + } + } + + // If we already have a rocm dependency wired, nothing more to do + libDir, err := AssetsDir() + if err != nil { + return "", fmt.Errorf("unable to lookup lib dir: %w", err) + } + rocmTargetDir := filepath.Join(libDir, "rocm") + if rocmLibUsable(rocmTargetDir) { + return rocmTargetDir, nil + } + + // Prefer explicit HIP env var + hipPath := os.Getenv("HIP_PATH") + if hipPath != "" { + hipLibDir := filepath.Join(hipPath, "bin") + if rocmLibUsable(hipLibDir) { + slog.Debug("detected ROCM via HIP_PATH=" + hipPath) + return hipLibDir, nil + } + } + + // Well known location(s) + if rocmLibUsable(RocmStandardLocation) { + return RocmStandardLocation, nil + } + + // Installer payload (if we're running from some other location) + localAppData := os.Getenv("LOCALAPPDATA") + appDir := filepath.Join(localAppData, "Programs", "Ollama") + rocmTargetDir = filepath.Join(appDir, "rocm") + if rocmLibUsable(rocmTargetDir) { + slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) + return rocmTargetDir, nil + } + + // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this + slog.Warn("amdgpu detected, but no compatible rocm library found. 
Please install ROCm v6") + return "", fmt.Errorf("no suitable rocm found, falling back to CPU") +} diff --git a/gpu/assets.go b/gpu/assets.go new file mode 100644 index 00000000..41d0046a --- /dev/null +++ b/gpu/assets.go @@ -0,0 +1,60 @@ +package gpu + +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "runtime" + "strings" + + "github.com/jmorganca/ollama/version" +) + +func AssetsDir() (string, error) { + home, err := os.UserHomeDir() + if err != nil { + return "", err + } + baseDir := filepath.Join(home, ".ollama", "assets") + libDirs, err := os.ReadDir(baseDir) + if err == nil { + for _, d := range libDirs { + if d.Name() == version.Version { + continue + } + // Special case the rocm dependencies, which are handled by the installer + if d.Name() == "rocm" { + continue + } + slog.Debug("stale lib detected, cleaning up " + d.Name()) + err = os.RemoveAll(filepath.Join(baseDir, d.Name())) + if err != nil { + slog.Warn(fmt.Sprintf("unable to clean up stale library %s: %s", filepath.Join(baseDir, d.Name()), err)) + } + } + } + return filepath.Join(baseDir, version.Version), nil +} + +func UpdatePath(dir string) { + if runtime.GOOS == "windows" { + tmpDir := filepath.Dir(dir) + pathComponents := strings.Split(os.Getenv("PATH"), ";") + i := 0 + for _, comp := range pathComponents { + if strings.EqualFold(comp, dir) { + return + } + // Remove any other prior paths to our temp dir + if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { + pathComponents[i] = comp + i++ + } + } + newPath := strings.Join(append([]string{dir}, pathComponents...), ";") + slog.Info(fmt.Sprintf("Updating PATH to %s", newPath)) + os.Setenv("PATH", newPath) + } + // linux and darwin rely on rpath +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 8c7f1297..e0c18e26 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -24,7 +24,6 @@ import ( type handles struct { cuda *C.cuda_handle_t - rocm *C.rocm_handle_t } var gpuMutex sync.Mutex @@ -54,39 +53,23 @@ var CudaWindowsGlobs = []string{ "c:\\Windows\\System32\\nvml.dll", } -var RocmLinuxGlobs = []string{ - "/opt/rocm*/lib*/librocm_smi64.so*", -} - -var RocmWindowsGlobs = []string{ - "c:\\Windows\\System32\\rocm_smi64.dll", -} - // Note: gpuMutex must already be held func initGPUHandles() { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing - gpuHandles = &handles{nil, nil} + gpuHandles = &handles{nil} var cudaMgmtName string var cudaMgmtPatterns []string - var rocmMgmtName string - var rocmMgmtPatterns []string switch runtime.GOOS { case "windows": cudaMgmtName = "nvml.dll" cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs)) copy(cudaMgmtPatterns, CudaWindowsGlobs) - rocmMgmtName = "rocm_smi64.dll" - rocmMgmtPatterns = make([]string, len(RocmWindowsGlobs)) - copy(rocmMgmtPatterns, RocmWindowsGlobs) case "linux": cudaMgmtName = "libnvidia-ml.so" cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs)) copy(cudaMgmtPatterns, CudaLinuxGlobs) - rocmMgmtName = "librocm_smi64.so" - rocmMgmtPatterns = make([]string, len(RocmLinuxGlobs)) - copy(rocmMgmtPatterns, RocmLinuxGlobs) default: return } @@ -101,16 +84,6 @@ func initGPUHandles() { return } } - - rocmLibPaths := FindGPULibs(rocmMgmtName, rocmMgmtPatterns) - if len(rocmLibPaths) > 0 { - rocm := LoadROCMMgmt(rocmLibPaths) - if rocm != nil { - slog.Info("Radeon GPU detected") - gpuHandles.rocm = rocm - return - } - } } func GetGPUInfo() GpuInfo { @@ -149,66 +122,10 @@ func GetGPUInfo() GpuInfo { slog.Info(fmt.Sprintf("CUDA GPU is too old. 
Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) } } - } else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") { - ver, err := AMDDriverVersion() - if err == nil { - slog.Info("AMD Driver: " + ver) - } else { - // For now this is benign, but we may eventually need to fail compatibility checks - slog.Debug("error looking up amd driver version: %s", err) - } - gfx := AMDGFXVersions() - tooOld := false - for _, v := range gfx { - if v.Major < 9 { - slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString()) - tooOld = true - break - } - - // TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major - // e.g. gfx1034 works if we map it to gfx1030 at runtime - - } - if !tooOld { - // TODO - this algo can be shifted over to use sysfs instead of the rocm info library... - C.rocm_check_vram(*gpuHandles.rocm, &memInfo) - if memInfo.err != nil { - slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) - C.free(unsafe.Pointer(memInfo.err)) - } else if memInfo.igpu_index >= 0 && memInfo.count == 1 { - // Only one GPU detected and it appears to be an integrated GPU - skip it - slog.Info("ROCm unsupported integrated GPU detected") - } else if memInfo.count > 0 { - if memInfo.igpu_index >= 0 { - // We have multiple GPUs reported, and one of them is an integrated GPU - // so we have to set the env var to bypass it - // If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it - val := os.Getenv("ROCR_VISIBLE_DEVICES") - if val == "" { - devices := []string{} - for i := 0; i < int(memInfo.count); i++ { - if i == int(memInfo.igpu_index) { - continue - } - devices = append(devices, strconv.Itoa(i)) - } - val = strings.Join(devices, ",") - os.Setenv("ROCR_VISIBLE_DEVICES", val) - } - slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val)) - } - resp.Library = "rocm" - var version C.rocm_version_resp_t - C.rocm_get_version(*gpuHandles.rocm, &version) - verString := C.GoString(version.str) - if version.status == 0 { - resp.Variant = "v" + verString - } else { - slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString)) - } - C.free(unsafe.Pointer(version.str)) - } + } else { + AMDGetGPUInfo(&resp) + if resp.Library != "" { + return resp } } if resp.Library == "" { @@ -338,23 +255,6 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t { return nil } -func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t { - var resp C.rocm_init_resp_t - resp.rh.verbose = getVerboseState() - for _, libPath := range rocmLibPaths { - lib := C.CString(libPath) - defer C.free(unsafe.Pointer(lib)) - C.rocm_init(lib, &resp) - if resp.err != nil { - slog.Info(fmt.Sprintf("Unable to load ROCm management library %s: %s", libPath, C.GoString(resp.err))) - C.free(unsafe.Pointer(resp.err)) - } else { - return &resp.rh - } - } - return nil -} - func getVerboseState() C.uint16_t { if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { return C.uint16_t(1) diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index e52d2066..8186a3f0 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -53,7 +53,6 @@ void cpu_check_ram(mem_info_t *resp); #endif #include "gpu_info_cuda.h" -#include "gpu_info_rocm.h" #endif // __GPU_INFO_H__ #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index a64b4587..509bf5c6 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -124,31 +124,31 @@ 
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { // When in verbose mode, report more information about // the card we discover, but don't fail on error ret = (*h.nvmlDeviceGetName)(device, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret != NVML_SUCCESS) { LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret); } else { LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf); } ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret != NVML_SUCCESS) { LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret); } else { LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf); } ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret != NVML_SUCCESS) { LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret); } else { LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf); } ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret != NVML_SUCCESS) { LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret); } else { LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf); } ret = (*h.nvmlDeviceGetBrand)(device, &brand); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret != NVML_SUCCESS) { LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret); } else { LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand); diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c deleted file mode 100644 index 7ac88611..00000000 --- a/gpu/gpu_info_rocm.c +++ /dev/null @@ -1,198 +0,0 @@ -#ifndef __APPLE__ - -#include "gpu_info_rocm.h" - -#include - -void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { - rsmi_status_t ret; - resp->err = NULL; - const int buflen = 256; - char buf[buflen + 1]; - int i; - struct lookup { - char *s; - void **p; - } l[] = { - {"rsmi_init", (void *)&resp->rh.rsmi_init}, - {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down}, - {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get}, - {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get}, - {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get}, - {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices}, - {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get}, - {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get}, - {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get}, - {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get}, - {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get}, - {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get}, - {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get}, - {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get}, - {NULL, NULL}, - }; - - resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); - if (!resp->rh.handle) { - char *msg = LOAD_ERR(); - snprintf(buf, buflen, - "Unable to load %s library to query for Radeon GPUs: %s\n", - rocm_lib_path, msg); - free(msg); - resp->err = strdup(buf); - return; - } - - // TODO once we've squashed the remaining corner cases remove this log - LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path); - - for (i = 0; l[i].s != NULL; i++) { - // TODO once we've squashed the remaining corner cases remove this log - LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s); - - *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); - if (!l[i].p) { - resp->rh.handle = NULL; - char *msg = LOAD_ERR(); - 
LOG(resp->rh.verbose, "dlerr: %s\n", msg); - UNLOAD_LIBRARY(resp->rh.handle); - snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, - msg); - free(msg); - resp->err = strdup(buf); - return; - } - } - - ret = (*resp->rh.rsmi_init)(0); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret); - UNLOAD_LIBRARY(resp->rh.handle); - resp->rh.handle = NULL; - snprintf(buf, buflen, "rocm vram init failure: %d", ret); - resp->err = strdup(buf); - } - - return; -} - -void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { - resp->err = NULL; - resp->igpu_index = -1; - uint64_t totalMem = 0; - uint64_t usedMem = 0; - rsmi_status_t ret; - const int buflen = 256; - char buf[buflen + 1]; - int i; - - if (h.handle == NULL) { - resp->err = strdup("rocm handle not initialized"); - return; - } - - ret = (*h.rsmi_num_monitor_devices)(&resp->count); - if (ret != RSMI_STATUS_SUCCESS) { - snprintf(buf, buflen, "unable to get device count: %d", ret); - resp->err = strdup(buf); - return; - } - LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count); - - resp->total = 0; - resp->free = 0; - for (i = 0; i < resp->count; i++) { - if (h.verbose) { - // When in verbose mode, report more information about - // the card we discover, but don't fail on error - ret = (*h.rsmi_dev_name_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf); - } - ret = (*h.rsmi_dev_brand_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf); - } - ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf); - } - ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf); - } - ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf); - } - ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf); - } - ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen); - if (ret != RSMI_STATUS_SUCCESS) { - LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret); - } else { - LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf); - } - } - - // Get total memory - used memory for available memory - ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem); - if (ret != RSMI_STATUS_SUCCESS) { - snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); - resp->err = strdup(buf); - return; - } - ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem); - if (ret != RSMI_STATUS_SUCCESS) { - snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); - resp->err = strdup(buf); - return; - } - LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem); - LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem); - if (totalMem < 1024 * 1024 * 1024) { - // Do not add up integrated GPU 
memory capacity, it's a bogus 512M, and actually uses system memory - LOG(h.verbose, "[%d] ROCm integrated GPU\n", i); - resp->igpu_index = i; - } else { - resp->total += totalMem; - resp->free += totalMem - usedMem; - } - } -} - -void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { - const int buflen = 256; - char buf[buflen + 1]; - if (h.handle == NULL) { - resp->str = strdup("rocm handle not initialized"); - resp->status = 1; - return; - } - rsmi_version_t ver; - rsmi_status_t ret; - ret = h.rsmi_version_get(&ver); - if (ret != RSMI_STATUS_SUCCESS) { - snprintf(buf, buflen, "unexpected response on version lookup %d", ret); - resp->status = 1; - } else { - snprintf(buf, buflen, "%d", ver.major); - resp->status = 0; - } - resp->str = strdup(buf); -} - -#endif // __APPLE__ diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h deleted file mode 100644 index 0a8d50c0..00000000 --- a/gpu/gpu_info_rocm.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef __APPLE__ -#ifndef __GPU_INFO_ROCM_H__ -#define __GPU_INFO_ROCM_H__ -#include "gpu_info.h" - -// Just enough typedef's to dlopen/dlsym for memory information -typedef enum rsmi_status_return { - RSMI_STATUS_SUCCESS = 0, - // Other values omitted for now... -} rsmi_status_t; - -typedef enum rsmi_memory_type { - RSMI_MEM_TYPE_VRAM = 0, - RSMI_MEM_TYPE_VIS_VRAM, - RSMI_MEM_TYPE_GTT, -} rsmi_memory_type_t; - - typedef struct { - uint32_t major; - uint32_t minor; - uint32_t patch; - const char *build; - } rsmi_version_t; - -typedef struct rocm_handle { - void *handle; - uint16_t verbose; - rsmi_status_t (*rsmi_init)(uint64_t); - rsmi_status_t (*rsmi_shut_down)(void); - rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *); - rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *); - rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version); - rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *); - rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *); - rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t); - rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t); - rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t); - rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t); - rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t); - rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t); - rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t); -} rocm_handle_t; - -typedef struct rocm_init_resp { - char *err; // If err is non-null handle is invalid - rocm_handle_t rh; -} rocm_init_resp_t; - -typedef struct rocm_version_resp { - rsmi_status_t status; - char *str; // Contains version or error string if status != 0 -} rocm_version_resp_t; - -void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp); -void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); -void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp); - -#endif // __GPU_INFO_ROCM_H__ -#endif // __APPLE__ \ No newline at end of file diff --git a/llm/dyn_ext_server.c b/llm/dyn_ext_server.c index 47dc4e99..dab49f85 100644 --- a/llm/dyn_ext_server.c +++ b/llm/dyn_ext_server.c @@ -14,17 +14,14 @@ #define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) #define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) #define UNLOAD_LIBRARY(handle) FreeLibrary(handle) -inline char *LOAD_ERR() { - LPSTR messageBuffer = NULL; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - 
FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&messageBuffer, 0, NULL); - char *resp = strdup(messageBuffer); - LocalFree(messageBuffer); - return resp; -} +#define LOAD_ERR() ({\ + LPSTR messageBuffer = NULL; \ + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \ + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \ + char *resp = strdup(messageBuffer); \ + LocalFree(messageBuffer); \ + resp; \ +}) #else #include #define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go index 8d7ebf9e..fa5a3477 100644 --- a/llm/dyn_ext_server.go +++ b/llm/dyn_ext_server.go @@ -28,13 +28,13 @@ import ( "log/slog" "os" "path/filepath" - "runtime" "strings" "sync" "time" "unsafe" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/gpu" ) type dynExtServer struct { @@ -72,7 +72,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } - updatePath(filepath.Dir(library)) + gpu.UpdatePath(filepath.Dir(library)) libPath := C.CString(library) defer C.free(unsafe.Pointer(libPath)) resp := newExtServerResp(512) @@ -148,6 +148,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts } slog.Info("Initializing llama server") + slog.Debug(fmt.Sprintf("server params: %+v", sparams)) initResp := newExtServerResp(128) defer freeExtServerResp(initResp) C.dyn_llama_server_init(llm.s, &sparams, &initResp) @@ -365,25 +366,3 @@ func (llm *dynExtServer) Close() { C.dyn_llama_server_stop(llm.s) mutex.Unlock() } - -func updatePath(dir string) { - if runtime.GOOS == "windows" { - tmpDir := filepath.Dir(dir) - pathComponents := strings.Split(os.Getenv("PATH"), ";") - i := 0 - for _, comp := range pathComponents { - if strings.EqualFold(comp, dir) { - return - } - // Remove any other prior paths to our temp dir - if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { - pathComponents[i] = comp - i++ - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ";") - slog.Info(fmt.Sprintf("Updating PATH to %s", newPath)) - os.Setenv("PATH", newPath) - } - // linux and darwin rely on rpath -} diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index e6a7d077..0b8cb344 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -179,17 +179,21 @@ fi if [ -d "${ROCM_PATH}" ]; then echo "ROCm libraries detected - building dynamic ROCm library" - if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then - ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true) + if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then + ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. 
|| true) fi init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Note: the ROCM libs and runtime library files are too large to embed, so we depend on - # them being present at runtime on the host + # Record the ROCM dependencies + rm -f "${BUILD_DIR}/lib/deps.txt" + touch "${BUILD_DIR}/lib/deps.txt" + for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do + echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt" + done compress_libs fi diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index e0313420..579b2bca 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -2,19 +2,52 @@ $ErrorActionPreference = "Stop" +function amdGPUs { + if ($env:AMDGPU_TARGETS) { + return $env:AMDGPU_TARGETS + } + # TODO - load from some common data file for linux + windows build consistency + $GPU_LIST = @( + "gfx900" + "gfx906:xnack-" + "gfx908:xnack-" + "gfx90a:xnack+" + "gfx90a:xnack-" + "gfx1010" + "gfx1012" + "gfx1030" + "gfx1100" + "gfx1101" + "gfx1102" + ) + $GPU_LIST -join ';' +} + function init_vars { + # Verify the environment is a Developer Shell for MSVC 2019 + write-host $env:VSINSTALLDIR + if (($env:VSINSTALLDIR -eq $null)) { + Write-Error "`r`nBUILD ERROR - YOUR DEVELOPMENT ENVIRONMENT IS NOT SET UP CORRECTLY`r`nTo build Ollama you must run from an MSVC Developer Shell`r`nSee .\docs\development.md for instructions to set up your dev environment" + exit 1 + } $script:SRC_DIR = $(resolve-path "..\..\") $script:llamacppDir = "../llama.cpp" - $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64") + $script:cmakeDefs = @( + "-DBUILD_SHARED_LIBS=on", + "-DLLAMA_NATIVE=off" + ) $script:cmakeTargets = @("ext_server") $script:ARCH = "amd64" # arm not yet supported. 
if ($env:CGO_CFLAGS -contains "-g") { - $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on") + $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") $script:config = "RelWithDebInfo" } else { - $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off") + $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release") $script:config = "Release" } + if ($null -ne $env:CMAKE_SYSTEM_VERSION) { + $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}") + } # Try to find the CUDA dir if ($env:CUDA_LIB_DIR -eq $null) { $d=(get-command -ea 'silentlycontinue' nvcc).path @@ -157,7 +190,7 @@ apply_patches $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" write-host "Building LCD CPU" build @@ -166,7 +199,7 @@ sign compress_libs init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" write-host "Building AVX CPU" build @@ -175,7 +208,7 @@ sign compress_libs init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs +$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" write-host "Building AVX2 CPU" build @@ -192,18 +225,51 @@ if ($null -ne $script:CUDA_LIB_DIR) { } init_vars $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") + $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") + write-host "Building CUDA" build install sign compress_libs } -# TODO - actually implement ROCm support on windows -$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm" -rm -ea 0 -recurse -force -path "${script:buildDir}/lib" -md "${script:buildDir}/lib" -ea 0 > $null -echo $null >> "${script:buildDir}/lib/.generated" +if ($null -ne $env:HIP_PATH) { + $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename + if ($null -ne $script:ROCM_VERSION) { + $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION + } + + init_vars + $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" + $script:cmakeDefs += @( + "-G", "Ninja", + 
"-DCMAKE_C_COMPILER=clang.exe", + "-DCMAKE_CXX_COMPILER=clang++.exe", + "-DLLAMA_HIPBLAS=on", + "-DLLAMA_AVX=on", + "-DLLAMA_AVX2=off", + "-DCMAKE_POSITION_INDEPENDENT_CODE=on", + "-DAMDGPU_TARGETS=$(amdGPUs)", + "-DGPU_TARGETS=$(amdGPUs)" + ) + + # Make sure the ROCm binary dir is first in the path + $env:PATH="$env:HIP_PATH\bin;$env:VSINSTALLDIR\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja;$env:PATH" + + # We have to clobber the LIB var from the developer shell for clang to work properly + $env:LIB="" + + write-host "Building ROCm" + build + # Ninja doesn't prefix with config name + ${script:config}="" + install + if ($null -ne $script:DUMPBIN) { + & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" + } + sign + compress_libs +} cleanup write-host "`ngo generate completed" diff --git a/llm/llm.go b/llm/llm.go index 81bab122..b0ac0f60 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -19,7 +19,7 @@ type LLM interface { Close() } -func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { +func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -120,15 +120,15 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlmServer(info, workDir, model, adapters, projectors, opts) + return newLlmServer(info, model, adapters, projectors, opts) } // Give any native cgo implementations an opportunity to initialize -func Init(workdir string) error { - return nativeInit(workdir) +func Init() error { + return nativeInit() } -func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { +func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) { dynLibs := getDynLibs(gpuInfo) // Check to see if the user has requested a specific library instead of auto-detecting @@ -147,7 +147,7 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto _, err := os.Stat(dynLibs[0]) if err != nil { slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0])) - err = nativeInit(workDir) + err = nativeInit() if err != nil { return nil, err } diff --git a/llm/payload_common.go b/llm/payload_common.go index 3958b9f5..ff38b63f 100644 --- a/llm/payload_common.go +++ b/llm/payload_common.go @@ -103,10 +103,14 @@ func rocmDynLibPresent() bool { return false } -func nativeInit(workdir string) error { +func nativeInit() error { slog.Info("Extracting dynamic libraries...") + assetsDir, err := gpu.AssetsDir() + if err != nil { + return err + } if runtime.GOOS == "darwin" { - err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") + err := extractPayloadFiles(assetsDir, "llama.cpp/ggml-metal.metal") if err != nil { if err == payloadMissing { // TODO perhaps consider this a hard failure on arm macs? 
@@ -115,10 +119,10 @@ func nativeInit(workdir string) error { } return err } - os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) + os.Setenv("GGML_METAL_PATH_RESOURCES", assetsDir) } - libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*") + libs, err := extractDynamicLibs(assetsDir, "llama.cpp/build/*/*/*/lib/*") if err != nil { if err == payloadMissing { slog.Info(fmt.Sprintf("%s", payloadMissing)) @@ -149,17 +153,13 @@ func nativeInit(workdir string) error { return nil } -func extractDynamicLibs(workDir, glob string) ([]string, error) { +func extractDynamicLibs(assetsDir, glob string) ([]string, error) { files, err := fs.Glob(libEmbed, glob) if err != nil || len(files) == 0 { return nil, payloadMissing } libs := []string{} - // TODO consider making this idempotent with some sort of persistent directory (where we store models probably) - // and tracking by version so we don't reexpand the files every time - // Also maybe consider lazy loading only what is needed - g := new(errgroup.Group) for _, file := range files { pathComps := strings.Split(file, "/") @@ -172,14 +172,14 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) { g.Go(func() error { // llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) + targetDir := filepath.Join(assetsDir, pathComps[pathComponentCount-3]) srcFile, err := libEmbed.Open(file) if err != nil { return fmt.Errorf("read payload %s: %v", file, err) } defer srcFile.Close() if err := os.MkdirAll(targetDir, 0o755); err != nil { - return fmt.Errorf("create payload temp dir %s: %v", workDir, err) + return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err) } src := io.Reader(srcFile) filename := file @@ -196,19 +196,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) { libs = append(libs, destFile) } - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", file, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, src); err != nil { - return fmt.Errorf("copy payload %s: %v", file, err) - } - case err != nil: - return fmt.Errorf("stat payload %s: %v", file, err) + destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", file, err) + } + defer destFp.Close() + if _, err := io.Copy(destFp, src); err != nil { + return fmt.Errorf("copy payload %s: %v", file, err) } return nil }) @@ -216,7 +210,7 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) { return libs, g.Wait() } -func extractPayloadFiles(workDir, glob string) error { +func extractPayloadFiles(assetsDir, glob string) error { files, err := fs.Glob(libEmbed, glob) if err != nil || len(files) == 0 { return payloadMissing @@ -228,8 +222,8 @@ func extractPayloadFiles(workDir, glob string) error { return fmt.Errorf("read payload %s: %v", file, err) } defer srcFile.Close() - if err := os.MkdirAll(workDir, 0o755); err != nil { - return fmt.Errorf("create payload temp dir %s: %v", workDir, err) + if err := os.MkdirAll(assetsDir, 0o755); err != nil { + return fmt.Errorf("create payload lib dir %s: %v", assetsDir, err) } src := io.Reader(srcFile) filename := file @@ -241,20 +235,22 @@ func extractPayloadFiles(workDir, glob string) error { 
filename = strings.TrimSuffix(filename, ".gz") } - destFile := filepath.Join(workDir, filepath.Base(filename)) + destFile := filepath.Join(assetsDir, filepath.Base(filename)) _, err = os.Stat(destFile) switch { case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) if err != nil { return fmt.Errorf("write payload %s: %v", file, err) } - defer destFile.Close() - if _, err := io.Copy(destFile, src); err != nil { + defer destFp.Close() + if _, err := io.Copy(destFp, src); err != nil { return fmt.Errorf("copy payload %s: %v", file, err) } case err != nil: return fmt.Errorf("stat payload %s: %v", file, err) + case err == nil: + slog.Debug("payload already exists: " + destFile) } } return nil diff --git a/llm/payload_linux.go b/llm/payload_linux.go index fc366209..276705c7 100644 --- a/llm/payload_linux.go +++ b/llm/payload_linux.go @@ -4,5 +4,5 @@ import ( "embed" ) -//go:embed llama.cpp/build/linux/*/*/lib/*.so* +//go:embed llama.cpp/build/linux/*/*/lib/* var libEmbed embed.FS diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 338dbcd5..77e21d40 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -22,5 +22,6 @@ for TARGETARCH in ${BUILD_ARCH}; do . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH + docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/dist/deps/ ./dist/ docker rm builder-$TARGETARCH done diff --git a/server/routes.go b/server/routes.go index 28165721..e5adc345 100644 --- a/server/routes.go +++ b/server/routes.go @@ -66,8 +66,6 @@ var defaultSessionDuration = 5 * time.Minute // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error { - workDir := c.GetString("workDir") - needLoad := loaded.runner == nil || // is there a model loaded? loaded.ModelPath != model.ModelPath || // has the base model changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? @@ -82,7 +80,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D loaded.Options = nil } - llmRunner, err := llm.New(workDir, model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts) + llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to @@ -1035,7 +1033,7 @@ func Serve(ln net.Listener) error { os.Exit(0) }() - if err := llm.Init(s.WorkDir); err != nil { + if err := llm.Init(); err != nil { return fmt.Errorf("unable to initialize llm library %w", err) } if runtime.GOOS == "linux" { // TODO - windows too