From b3f75fc812fc1559090a7fd9739bd203817a5979 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 14 Aug 2024 14:37:51 -0700
Subject: [PATCH 01/34] fix noprune

---
 server/images.go | 63 ++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/server/images.go b/server/images.go
index 0e753f56..798ed818 100644
--- a/server/images.go
+++ b/server/images.go
@@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}
 
-	if _, err = os.Stat(fp); err != nil {
-		return nil, "", err
-	}
-
-	var manifest *Manifest
-
-	bts, err := os.ReadFile(fp)
+	f, err := os.Open(fp)
 	if err != nil {
-		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
+		return nil, "", err
 	}
+	defer f.Close()
 
-	shaSum := sha256.Sum256(bts)
-	shaStr := hex.EncodeToString(shaSum[:])
+	sha256sum := sha256.New()
 
-	if err := json.Unmarshal(bts, &manifest); err != nil {
+	var manifest Manifest
+	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
 		return nil, "", err
 	}
 
-	return manifest, shaStr, nil
+	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
 }
 
 func GetModel(name string) (*Model, error) {
@@ -716,7 +711,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{})
 		// save (i.e. delete from the deleteMap) any files used in other manifests
 		manifest, _, err := GetManifest(fmp)
 		if err != nil {
-			return err
+			return fmt.Errorf("error reading manifest %s: %w", path, err)
 		}
 
 		for _, layer := range manifest.Layers {
@@ -781,8 +776,7 @@ func PruneLayers() error {
 
 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
 
-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
+	if err := deleteUnusedLayers(nil, deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -877,26 +871,19 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)
 
-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
-			return err
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// noop
+	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	} else {
+		for _, l := range manifest.Layers {
+			deleteMap[l.Digest] = struct{}{}
 		}
-
-		if manifest != nil {
-			for _, l := range manifest.Layers {
-				deleteMap[l.Digest] = struct{}{}
-			}
-			if manifest.Config.Digest != "" {
-				deleteMap[manifest.Config.Digest] = struct{}{}
-			}
+		if manifest.Config.Digest != "" {
+			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}
 
@@ -975,11 +962,9 @@
 		return err
 	}
 
-	if noprune == "" {
-		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap)
-		if err != nil {
-			slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+	if !envconfig.NoPrune() && len(deleteMap) > 0 {
+		fn(api.ProgressResponse{Status: "removing unused layers"})
+		if err := deleteUnusedLayers(nil, deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
@@ -1000,12 +985,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()
 
-	var m *Manifest
+	var m Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}
 
-	return m, err
+	return &m, err
 }
 
 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer

From 237dccba1edb41bb65ed1ffc6eafdd40dd6085e4 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 14 Aug 2024 16:36:07 -0700
Subject: [PATCH 02/34] skip invalid manifest files

---
 server/images.go   | 35 +++++------------------------------
 server/manifest.go |  2 +-
 2 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/server/images.go b/server/images.go
index 798ed818..8b3a67cf 100644
--- a/server/images.go
+++ b/server/images.go
@@ -687,43 +687,18 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }
 
-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
-	fp, err := GetManifestPath()
+func deleteUnusedLayers(deleteMap map[string]struct{}) error {
+	manifests, err := Manifests()
 	if err != nil {
 		return err
 	}
 
-	walkFunc := func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
-		if err != nil {
-			return fmt.Errorf("error reading manifest %s: %w", path, err)
-		}
-
+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}
 
 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}
 
 	// only delete the files which are still in the deleteMap
@@ -776,7 +751,7 @@ func PruneLayers() error {
 
 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
 
-	if err := deleteUnusedLayers(nil, deleteMap); err != nil {
+	if err := deleteUnusedLayers(deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -964,7 +939,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 
 	if !envconfig.NoPrune() && len(deleteMap) > 0 {
 		fn(api.ProgressResponse{Status: "removing unused layers"})
-		if err := deleteUnusedLayers(nil, deleteMap); err != nil {
+		if err := deleteUnusedLayers(deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
diff --git a/server/manifest.go b/server/manifest.go
index 6a5d7b88..0f19641d 100644
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -150,7 +150,7 @@ func Manifests() (map[model.Name]*Manifest, error) {
 
 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
-				slog.Warn("bad manifest name", "path", rel, "error", err)
+				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}
 

From 3a75e74e34c976d596437c8aa14587ada562301e Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 15 Aug 2024 10:29:14 -0700
Subject: [PATCH 03/34] only skip invalid json manifests

---
 server/manifest.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/manifest.go b/server/manifest.go
index 0f19641d..6b04753f 100644
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"log/slog"
 	"os"
@@ -155,9 +156,11 @@ func Manifests() (map[model.Name]*Manifest, error) {
 			}
 
 			m, err := ParseNamedManifest(n)
-			if err != nil {
+			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
+			} else if err != nil {
+				return nil, fmt.Errorf("%s: %w", n, err)
 			}
 
 			ms[n] = m

From a84c05cf9140c2eb288a6c7b56bb1c592bbaacc7 Mon Sep 17 00:00:00 2001
From: eust-w
Date: Fri, 16 Aug 2024 06:00:12 +0800
Subject: [PATCH 04/34] fix: Add tooltip to system tray icon

- Updated setIcon method to include tooltip text for the system tray icon.
- Added NIF_TIP flag and set the tooltip text using UTF16 encoding.

Resolves: #6372
---
 app/tray/wintray/tray.go   | 8 +++++++-
 app/tray/wintray/w32api.go | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go
index ccd087a1..6f827893 100644
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
+	"syscall"
 	"unsafe"
 
 	"golang.org/x/sys/windows"
@@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON
+	t.nid.Flags |= NIF_ICON | NIF_TIP
+	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+		copy(t.nid.Tip[:], toolTipUTF16)
+	} else {
+		return err
+	}
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
 
 	return t.nid.modify()
diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go
index a1e0381d..7c7c0ac8 100644
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -61,6 +61,7 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
+	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0

From bdc4308afb72d47ce63583427f810b02d569d58a Mon Sep 17 00:00:00 2001
From: zwwhdls
Date: Fri, 16 Aug 2024 11:43:19 +0800
Subject: [PATCH 05/34] fix: chmod new layer to 0o644 when creating it

Signed-off-by: zwwhdls
---
 server/layer.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/server/layer.go b/server/layer.go
index c666bd10..0bdee72b 100644
--- a/server/layer.go
+++ b/server/layer.go
@@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 		if err := os.Rename(temp.Name(), blob); err != nil {
 			return Layer{}, err
 		}
+		if err := os.Chmod(blob, 0o644); err != nil {
+			return Layer{}, err
+		}
 	}
 
 	return Layer{

From 0ad0e738cd7ed1266b3c210ad54dcd2b70142563 Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 01:43:26 +0200
Subject: [PATCH 06/34] Override numParallel only if unset.

---
 server/sched.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/server/sched.go b/server/sched.go
index 9947fd32..4d9c0296 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -734,7 +734,9 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	*numParallel = 1
+	if *numParallel <= 0 {
+		*numParallel = 1
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus

From 9352eeb752531decccc7c6b91a07bc3dd5efa67e Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 02:55:01 +0200
Subject: [PATCH 07/34] Reset NumCtx.

---
 server/sched.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/sched.go b/server/sched.go
index 4d9c0296..3fe6d7fc 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -736,6 +736,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
+		req.opts.NumCtx = req.origNumCtx
 	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {

From 885cf45087863aa2e064a05da99e8bd07d69970a Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 03:07:16 +0200
Subject: [PATCH 08/34] Fix white space.

---
 server/sched.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 3fe6d7fc..9d8c4144 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -736,8 +736,8 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
-        req.opts.NumCtx = req.origNumCtx
-    }
+		req.opts.NumCtx = req.origNumCtx
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus

From 9fddef3731842bd8f40d217da6b84ab7ef5dfe97 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 19 Aug 2024 09:20:52 -0700
Subject: [PATCH 09/34] server: limit upload parts to 16 (#6411)

---
 server/upload.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/upload.go b/server/upload.go
index 2f115436..020e8955 100644
--- a/server/upload.go
+++ b/server/upload.go
@@ -45,7 +45,7 @@ type blobUpload struct {
 }
 
 const (
-	numUploadParts          = 64
+	numUploadParts          = 16
 	minUploadPartSize int64 = 100 * format.MegaByte
 	maxUploadPartSize int64 = 1000 * format.MegaByte
 )

From 74d45f010276c2f2653f3ca8c4f76cb0552fb46e Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 8 Jul 2024 12:50:11 -0700
Subject: [PATCH 10/34] Refactor linux packaging

This adjusts linux to follow a similar model to windows, with a discrete
archive (zip/tgz) to carry the primary executable and dependent
libraries. Runners are still carried as payloads inside the main binary.

Darwin retains the payload model, where the go binary is fully
self-contained.
---
 .github/workflows/release.yaml |  1 -
 Dockerfile                     | 29 ++++++------
 app/ollama.iss                 | 11 +----
 envconfig/config.go            |  4 +-
 gpu/amd_common.go              |  2 +-
 gpu/amd_windows.go             |  2 +-
 gpu/gpu.go                     | 50 ++++++++++++++------
 gpu/gpu_linux.go               |  2 +-
 llm/ext_server/CMakeLists.txt  |  3 +-
 llm/generate/gen_common.sh     | 17 ++++++-
 llm/generate/gen_linux.sh      | 81 ++++++++++++++++------------------
 llm/generate/gen_windows.ps1   | 43 +++++++++---------
 llm/server.go                  | 12 +++--
 scripts/build_linux.sh         | 10 ++---
 scripts/build_windows.ps1      | 12 ++---
 scripts/install.sh             | 31 ++++++++++---
 16 files changed, 171 insertions(+), 139 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 5ae630c3..9287f6f7 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -363,7 +363,6 @@ jobs:
       - run: |
           ./scripts/build_linux.sh
          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
       - uses: actions/upload-artifact@v4
         with:
           name: dist-linux-amd64
diff --git a/Dockerfile b/Dockerfile
index c8efdd8a..120ddc21 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
+ENV GOARCH amd64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
@@ -28,6 +29,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
+ENV GOARCH arm64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
@@ -40,15 +42,10 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
+ENV GOARCH amd64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
-        cp ${dep} /tmp/scratch/ || exit 1 ; \
-    done && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
-    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
-
+RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - )
 
 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
@@ -59,6 +56,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
@@ -79,6 +77,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
@@ -95,12 +94,13 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build -trimpath -o dist/linux-amd64/ollama .
 
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -109,23 +109,24 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build -trimpath -o dist/linux-arm64/ollama .
 
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama
 
 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama
 
 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
 
diff --git a/app/ollama.iss b/app/ollama.iss
index dc6178f7..e9cf48ec 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -91,16 +91,7 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
-
+Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs
 
 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
diff --git a/envconfig/config.go b/envconfig/config.go
index b82b773d..7f0976c0 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -193,8 +193,8 @@ func RunnersDir() (p string) {
 	for _, root := range []string{filepath.Dir(exe), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}
 
diff --git a/gpu/amd_common.go b/gpu/amd_common.go
index 2839cb7c..05747208 100644
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index edabeb43..5d25a966 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, "ollama_libs")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
diff --git a/gpu/gpu.go b/gpu/gpu.go
index dc124a3e..d0ae0f34 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -229,11 +229,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}
 
-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := GetDepDir()
 
 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -306,13 +302,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@@ -467,10 +456,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)
 
+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(GetDepDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -479,13 +470,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,3 +633,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func GetDepDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependencies, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), cwd} {
+		libDep := "ollama_libs"
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go
index d6d2675c..d4d20bc4 100644
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )
 
 func GetCPUMem() (memInfo, error) {
diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt
index bfc97c63..90fd0ef2 100644
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index da1b0688..f1541f2a 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -9,11 +9,14 @@ init_vars() {
         ARCH="arm64"
         ;;
     *)
-        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
     esac
 
     LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
     CMAKE_TARGETS="--target ollama_llama_server"
     if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,7 @@ init_vars() {
         WHOLE_ARCHIVE="-Wl,-force_load"
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
         ;;
     "Linux")
         LIB_EXT="so"
@@ -35,6 +39,7 @@ init_vars() {
         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
         ;;
     *)
         ;;
@@ -105,6 +110,14 @@ compress() {
     echo "Finished compression"
 }
 
+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
     (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index db2c6c30..70fc0313 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -77,10 +77,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
+        install
        compress
     else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -93,7 +94,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -103,6 +104,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
+            install
             compress
         fi
 
@@ -120,6 +122,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
             echo "Building AVX CPU"
             build
+            install
             compress
         fi
 
@@ -133,6 +136,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
             echo "Building AVX2 CPU"
             build
+            install
             compress
         fi
     fi
@@ -178,29 +182,18 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +211,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,21 +258,18 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi 
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d..1f8c96d8 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/server.go b/llm/server.go index d2b8db9b..9347a458 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server 
directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f..4ea51229 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -21,11 +21,9 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ - fi - + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc73759..e8d851f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,22 +103,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem 
"${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 03af5a69..f0439b00 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -63,16 +63,32 @@ if [ -n "$NEEDS" ]; then exit 1 fi -status "Downloading ollama..." -curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" - for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done +OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} -status "Installing ollama to $BINDIR..." +status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then + status "Downloading Linux ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + BUNDLE=1 +else + status "Downloading Linux ${ARCH} CLI" + curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ + "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" + $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama + BUNDLE=0 +fi + +if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" +fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -178,6 +194,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then + if [ $BUNDLE -ne 0 ]; then + install_success + status "AMD GPU ready." 
+        exit 0
+    fi
     # Look for pre-existing ROCm v6 before downloading the dependencies
     for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
         if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then

From c7bcb0031965e33531358639620a11516d101b54 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 9 Aug 2024 07:21:40 -0700
Subject: [PATCH 11/34] Wire up ccache and pigz in the docker-based build

This should help speed things up a little.

---
 Dockerfile                 | 37 ++++++++++++++++++++++++++-----------
 llm/generate/gen_common.sh | 15 +++++++++------
 llm/generate/gen_darwin.sh |  2 ++
 llm/generate/gen_linux.sh  |  2 ++
 scripts/build_linux.sh     |  3 ++-
 scripts/rh_linux_deps.sh   | 14 ++++++++++++--
 6 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 120ddc21..8eb90057 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,7 +19,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
@@ -30,7 +31,12 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -43,7 +49,8 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 ENV GOARCH amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \
     (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - )
 
@@ -60,13 +67,17 @@ ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
 
 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -81,9 +92,11 @@ ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 
 
 # Intermediate stage used for ./scripts/build_linux.sh
@@ -100,7 +113,8 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath -o dist/linux-amd64/ollama .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/ollama .
 
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -113,7 +127,8 @@ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath -o dist/linux-arm64/ollama .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/ollama .
 
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index f1541f2a..40115936 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -47,6 +47,7 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }
 
 git_module_setup() {
@@ -90,21 +91,23 @@ build() {
 
 compress() {
     echo "Compressing payloads to reduce overall binary size..."
-    pids=""
     rm -rf ${BUILD_DIR}/bin/*.gz
     for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        ${GZIP} -n --best -f ${f} &
+        compress_pids+=" $!"
     done
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
         for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            ${GZIP} -n --best -f ${f} &
+            compress_pids+=" $!"
         done
     fi
     echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
         wait $pid
     done
     echo "Finished compression"
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 6c0b62cb..f22c0f8e 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -6,6 +6,7 @@
 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -98,4 +99,5 @@ esac
 
 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 70fc0313..1365d07d 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -13,6 +13,7 @@
 set -ex
 set -o pipefail
+compress_pids=""
 
 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -274,4 +275,5 @@ fi
 
 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 4ea51229..ebb60c5a 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -4,6 +4,7 @@ set -eu
 
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")
 
 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@@ -25,5 +26,5 @@ for TARGETARCH in ${BUILD_ARCH}; do
     docker rm builder-$TARGETARCH
     echo "Compressing final linux bundle..."
     rm -f ./dist/ollama-linux-$TARGETARCH.tgz
-    (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz )
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
 done
diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh
index 81648d68..b4c9afd6 100644
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -3,6 +3,7 @@
 # Script for common Dockerfile dependency installation in redhat linux based images
 
 set -ex
+set -o pipefail
 MACHINE=$(uname -m)
 
 if grep -i "centos" /etc/system-release >/dev/null; then
@@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
         dnf install -y rh-git227-git
         ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
     fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
 elif grep -i "rocky" /etc/system-release >/dev/null; then
     # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
     cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
     dnf install -y git \
         gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        pigz
 else
     echo "ERROR Unexpected distro"
     exit 1
 fi
 
+if [ "${MACHINE}" = "x86_64" ] ; then
+    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
+        mv /tmp/ccache /usr/local/bin/
+else
+    yum -y install epel-release
+    yum install -y ccache
+fi
+
 if [ -n "${CMAKE_VERSION}" ]; then
     curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi

From d470ebe78bc76c098bc378f98f08f7094063ab4d Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 30 May 2024 21:54:07 -0700
Subject: [PATCH 12/34] Add Jetson cuda variants for arm

This adds new variants for arm64 specific to Jetson platforms.

---
 Dockerfile                | 48 +++++++++++++++++++++++++++++----
 gpu/gpu.go                | 44 +++++++++++++++++++++++++++++--
 gpu/gpu_darwin.go         |  4 ++--
 gpu/types.go              |  6 ++---
 llm/generate/gen_linux.sh |  5 ++--
 llm/payload.go            |  4 ++--
 scripts/build_linux.sh    |  1 +
 7 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8eb90057..79b2a696 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,6 +3,9 @@ ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1
+ARG JETPACK_4=r32.7.1
 
 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -22,7 +25,7 @@ ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -31,11 +34,40 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ - CUDA_VARIANT="_v11" \ + CUDA_VARIANT="_jetpack6" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ + CMAKE_CUDA_ARCHITECTURES="87" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CUDA_VARIANT="_jetpack5" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ + CMAKE_CUDA_ARCHITECTURES="72;87" \ bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -123,8 +155,14 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +## arm binary += 381M +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +## arm binary += 330M +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ diff --git a/gpu/gpu.go b/gpu/gpu.go index d0ae0f34..22461922 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,7 +15,9 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -215,7 +217,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -231,6 +233,35 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() + var cudaVariant string + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + cudaVariant = "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + cudaVariant = "jetpack5" + case 36: + cudaVariant = "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + // Load ALL libraries cHandles = initCudaHandles() @@ -240,6 +271,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", + Variant: cudaVariant, }, index: i, } @@ -266,7 +298,15 @@ func GetGPUInfo() GpuInfoList { gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if cudaVariant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84e..417b48df 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/types.go b/gpu/types.go index 8d22b06b..fc628d47 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -81,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 1365d07d..dc9dda5a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -165,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -189,9 +189,10 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" build install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do cp -a "${lib}" "${CUDA_DIST_DIR}" diff --git a/llm/payload.go b/llm/payload.go index b402e1f2..963b3295 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index ebb60c5a..adda2ad7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -22,6 +22,7 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH + rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." From fc3b4cda89f468f923e2e6095c6a62a5c3c336ff Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 09:36:30 -0700 Subject: [PATCH 13/34] Report GPU variant in log --- gpu/types.go | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/types.go b/gpu/types.go index fc628d47..88539078 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -105,6 +105,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, From 4fe3a556faf790ba993223cfdda16e281b6cb76d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 13 Jun 2024 20:46:14 -0700 Subject: [PATCH 14/34] Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants. 
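
In short, the selection boils down to the following (an illustrative,
self-contained sketch only; the real helper added in this patch is
cudaGetVariant(gpuInfo CudaGPUInfo) in gpu/cuda_common.go, which first checks
for Jetson/L4T systems and returns a "jetpackN" variant for those):

    package main

    import "fmt"

    // pickCudaVariant is a hypothetical name used here for illustration.
    func pickCudaVariant(computeMajor, driverMajor int) string {
        // Pre-Pascal GPUs (compute capability < 6.x) and drivers older
        // than CUDA 12 can only run the v11 build.
        if computeMajor < 6 || driverMajor < 12 {
            return "v11"
        }
        return "v12"
    }

    func main() {
        fmt.Println(pickCudaVariant(8, 12)) // e.g. an A100 on a 12.x driver -> "v12"
    }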
--- Dockerfile | 43 +++++++++++++++++++++++++++++++++---------- gpu/cuda_common.go | 43 +++++++++++++++++++++++++++++++++++++++++++ gpu/gpu.go | 40 ++++------------------------------------ gpu/types.go | 6 ++++-- 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/Dockerfile b/Dockerfile index 79b2a696..e200f5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_VERSION_12=12.4.0 ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -13,7 +13,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CUDA_VARIANT="_v12" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -139,8 +160,10 @@ COPY . . 
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS @@ -155,8 +178,8 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c..defaa60a 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaGetVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 22461922..eb87807a 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,9 +15,7 @@ import ( "log/slog" "os" "path/filepath" - "regexp" "runtime" - "strconv" "strings" "sync" "unsafe" @@ -66,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() - var cudaVariant string - if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { - if CudaTegra != "" { - ver := strings.Split(CudaTegra, ".") - if len(ver) > 0 { - cudaVariant = "jetpack" + ver[0] - } - } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { - r := regexp.MustCompile(` R(\d+) `) - m := r.FindSubmatch(data) - if len(m) != 2 { - slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") - } else { - if l4t, err := strconv.Atoi(string(m[1])); err == nil { - // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) - // https://developer.nvidia.com/embedded/jetpack-archive - switch l4t { - case 35: - cudaVariant = "jetpack5" - case 36: - cudaVariant = "jetpack6" - default: - slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) - } - } - } - } - } - // Load ALL libraries cHandles = initCudaHandles() @@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", - Variant: cudaVariant, }, index: i, } @@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + cudaVariant := cudaGetVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory @@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = cudaGetVariant(gpuInfo) // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates diff --git a/gpu/types.go b/gpu/types.go index 88539078..4cbbeb84 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo From f6c811b32075cb3b7633d7d4213251d474a77682 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 11:35:41 -0700 Subject: [PATCH 15/34] Enable cuda v12 flags --- Dockerfile | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index e200f5d4..e83a266a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -21,11 +23,12 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ CUDA_VARIANT="_v11" \ bash gen_linux.sh @@ -37,12 +40,14 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ 
OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 @@ -53,9 +58,31 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -180,6 +207,8 @@ COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ From 927d98a6cde43ffee3ef269cf013df5e96cbe767 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 14:33:13 -0700 Subject: [PATCH 16/34] Add windows cuda v12 + v11 support --- .github/workflows/release.yaml | 93 ++++++++++++++++++++++++++++++++-- llm/generate/gen_windows.ps1 | 6 +-- scripts/build_windows.ps1 | 63 ++++++++++++++++++----- 3 files changed, 142 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9287f6f7..4bd68455 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,8 +183,8 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA generation step - generate-windows-cuda: + # CUDA v11 generation step + generate-windows-cuda-v11: environment: release runs-on: windows env: @@ -256,7 +256,89 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 + path: | + llm/build/**/bin/* + dist/windows-amd64/** + - uses: actions/upload-artifact@v4 + with: + name: windows-cuda-deps + path: 
dist/deps/* + + # CUDA v12 generation step + generate-windows-cuda-v12: + environment: release + runs-on: windows + env: + KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + steps: + - uses: actions/checkout@v4 + - name: Set Version + shell: bash + run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + - name: 'Install CUDA' + run: | + $ErrorActionPreference = "Stop" + write-host "downloading CUDA Installer" + Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + write-host "Installing CUDA" + Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait + write-host "Completed CUDA" + $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) + $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + echo "$cudaPath\bin" >> $env:GITHUB_PATH + echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV + - name: 'Verify CUDA' + run: nvcc -V + - run: go get ./... + - name: go generate + run: | + $gopath=(get-command go).source | split-path -parent + $cudabin=(get-command nvcc).source | split-path + & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" + cd $env:GITHUB_WORKSPACE + $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" + $env:PATH="$gopath;$cudabin;$env:PATH" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + go generate -x ./... 
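+      # Apart from the CUDA 12.4 installer URL above and the artifact names
+      # below, this job mirrors the v11 job so the two stay easy to keep in
+      # sync.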
+ - name: 'gather cuda dependencies' + run: | + $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] + md "dist\deps" + cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" + - uses: actions/upload-artifact@v4 + with: + name: generate-windows-cuda-v12 path: | llm/build/**/bin/* dist/windows-amd64/** @@ -270,7 +352,8 @@ jobs: environment: release runs-on: windows needs: - - generate-windows-cuda + - generate-windows-cuda-v11 + - generate-windows-cuda-v12 - generate-windows-rocm - generate-windows-cpu env: @@ -314,7 +397,7 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 1f8c96d8..42708d3e 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index e8d851f4..50b60230 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - 
echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... + Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" + + # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle + # which targets to build + + # Start by skipping CUDA to build everything else + pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + + # Then skip everyhting else and build all the CUDA variants + foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { + write-host "Building CUDA ${env:CUDA_LIB_DIR}" + + if ($env:CUDA_LIB_DIR.Contains("v12")) { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" + $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... + } + } else { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" + $env:OLLAMA_CUSTOM_CUDA_DEFS="" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... 
+ } + } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" From 3b19cdba2a090772b2e886dbfbf712992fafe0cd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 13:30:28 -0700 Subject: [PATCH 17/34] Remove Jetpack --- Dockerfile | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index e83a266a..99ba5b65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,6 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 -ARG JETPACK_6=r36.2.0 -ARG JETPACK_5=r35.4.1 -ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code @@ -84,39 +81,6 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack6" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ - CMAKE_CUDA_ARCHITECTURES="87" \ - bash gen_linux.sh - -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack5" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ - CMAKE_CUDA_ARCHITECTURES="72;87" \ - bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -209,12 +173,6 @@ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ di COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -## arm binary += 381M -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -## arm binary += 330M -COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY 
--from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ From 88bb9e332877dfbba40030c19570fdbe00f41a21 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 14 Aug 2024 16:32:57 -0700 Subject: [PATCH 18/34] Adjust layout to bin+lib/ollama --- Dockerfile | 23 ++++++++++++++------ app/ollama.iss | 12 +++++------ docs/linux.md | 10 ++++----- envconfig/config.go | 6 +++--- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 4 ++-- llm/generate/gen_linux.sh | 6 +++--- llm/generate/gen_windows.ps1 | 42 ++++++++++++++++++------------------ scripts/build_windows.ps1 | 16 +++++++------- scripts/install.sh | 14 +++++++----- 11 files changed, 74 insertions(+), 63 deletions(-) diff --git a/Dockerfile b/Dockerfile index 99ba5b65..d4b86918 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -160,7 +160,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ l ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-amd64/ollama . + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -176,20 +176,29 @@ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/buil ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-arm64/ollama . + go build -trimpath -o dist/linux-arm64/bin/ollama . 
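+
+# Layout note: from here on the dist trees follow a bin+lib split, with the
+# binary at dist/linux-<arch>/bin/ollama and bundled GPU libraries under
+# dist/linux-<arch>/lib/ollama/, so the Linux tarball can be extracted
+# straight onto a prefix such as /usr or /usr/local.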
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index e9cf48ec..bce0a337 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,11 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: 
NeedsAddPath('{app}\bin') [Code] diff --git a/docs/linux.md b/docs/linux.md index ec730656..3ed2bed0 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download the `ollama` tar file -Ollama is distributed as a self-contained binary. Download it to a directory in your PATH: +Ollama is distributed as a tar file including GPU library dependencies. ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ## Installing specific versions diff --git a/envconfig/config.go b/envconfig/config.go index 7f0976c0..7e45a4f5 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") } }() @@ -190,7 +190,7 @@ func RunnersDir() (p string) { } var paths []string - for _, root := range []string{filepath.Dir(exe), cwd} { + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { paths = append(paths, root, filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), @@ -200,7 +200,7 @@ func RunnersDir() (p string) { // Try a few variations to improve developer experience when building from source in the local tree for _, path := range paths { - candidate := filepath.Join(path, "ollama_runners") + candidate := filepath.Join(path, "lib", "ollama", "runners") if _, err := os.Stat(candidate); err == nil { p = candidate break diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 05747208..72d204f7 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 5d25a966..a0ae7c96 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "ollama_libs") + rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index eb87807a..391c98a8 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -653,8 +653,8 @@ func GetDepDir() string { slog.Warn("failed to lookup working directory", 
"error", err) } // Scan for any of our dependeices, and pick first match - for _, root := range []string{filepath.Dir(appExe), cwd} { - libDep := "ollama_libs" + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { return filepath.Join(root, libDep) } diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index dc9dda5a..aef03f9a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -189,7 +189,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" @@ -213,7 +213,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build @@ -260,7 +260,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 42708d3e..4d43c9e2 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -286,11 +286,11 @@ function build_cuda() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null 
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -324,17 +324,17 @@ function build_oneapi() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -384,11 +384,11 @@ function build_rocm() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 50b60230..9cebf1f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -122,8 +122,8 @@ function buildOllama() { /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } - New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force - cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\ + New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force + cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ } function buildApp() { @@ -142,22 +142,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null + md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index f0439b00..a02a0675 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -66,7 +66,7 @@ fi for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done 
-OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}}
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
 status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
@@ -77,18 +77,22 @@ if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-
         "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
         $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
     BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/bin/ollama" "$BINDIR/ollama"
+    fi
 else
     status "Downloading Linux ${ARCH} CLI"
     curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
         "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
     $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
     BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
 fi
-if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
-    status "Making ollama accessible in the PATH in $BINDIR"
-    $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-fi
 
 install_success() {
     status 'The Ollama API is now available at 127.0.0.1:11434.'

From f9e31da9463092d7b3661594788c259d6d55b3d9 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 15 Aug 2024 14:38:14 -0700
Subject: [PATCH 19/34] Review comments

---
 .github/workflows/release.yaml | 106 ++++++---------------------------
 docs/linux.md                  |   8 +--
 gpu/cuda_common.go             |   2 +-
 gpu/gpu.go                     |  16 ++---
 llm/generate/gen_windows.ps1   |   4 +-
 5 files changed, 32 insertions(+), 104 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 4bd68455..508fbb35 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -183,10 +183,17 @@ jobs:
           name: windows-rocm-deps
           path: dist/deps/*
 
-  # CUDA v11 generation step
-  generate-windows-cuda-v11:
+  # CUDA generation step
+  generate-windows-cuda:
     environment: release
     runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
     env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - name: 'Install CUDA' - run: | - $ErrorActionPreference = "Stop" - write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" - write-host "Installing CUDA" - Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait - write-host "Completed CUDA" - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' - echo "$cudaPath\bin" >> $env:GITHUB_PATH - echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV - - name: 'Verify CUDA' - run: nvcc -V - - run: go get ./... - - name: go generate - run: | - $gopath=(get-command go).source | split-path -parent - $cudabin=(get-command nvcc).source | split-path - & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" - cd $env:GITHUB_WORKSPACE - $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$cudabin;$env:PATH" - $env:OLLAMA_SKIP_CPU_GENERATE="1" - go generate -x ./... 
- - name: 'gather cuda dependencies' - run: | - $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] - md "dist\deps" - cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-cuda-v12 - path: | - llm/build/**/bin/* - dist/windows-amd64/** - - uses: actions/upload-artifact@v4 - with: - name: windows-cuda-deps - path: dist/deps/* # Import the prior generation steps and build the final windows assets build-windows: environment: release runs-on: windows needs: - - generate-windows-cuda-v11 - - generate-windows-cuda-v12 + - generate-windows-cuda - generate-windows-rocm - generate-windows-cpu env: @@ -397,7 +322,10 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-11 + - uses: actions/download-artifact@v4 + with: + name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/docs/linux.md b/docs/linux.md index 3ed2bed0..d1d5892c 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,12 +20,12 @@ GPU. ## Manual install -### Download the `ollama` tar file +### Download `ollama` -Ollama is distributed as a tar file including GPU library dependencies. +Download and extract the Linux package: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -95,7 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index defaa60a..827cc9b4 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -28,7 +28,7 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } -func cudaGetVariant(gpuInfo CudaGPUInfo) string { +func cudaVariant(gpuInfo CudaGPUInfo) string { if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { if CudaTegra != "" { ver := strings.Split(CudaTegra, ".") diff --git a/gpu/gpu.go b/gpu/gpu.go index 391c98a8..72d237a6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -225,7 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - depPath := GetDepDir() + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -264,20 +264,20 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - cudaVariant := cudaGetVariant(gpuInfo) + variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory - if cudaVariant != "" { - if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { - gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) } } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = 
driverMajor gpuInfo.DriverMinor = driverMinor - gpuInfo.Variant = cudaGetVariant(gpuInfo) + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -468,7 +468,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { slog.Debug("Searching for GPU library", "name", baseLibName) // Start with our bundled libraries - patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} switch runtime.GOOS { case "windows": @@ -642,7 +642,7 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { } } -func GetDepDir() string { +func LibraryDir() string { // On Windows/linux we bundle the dependencies at the same level as the executable appExe, err := os.Executable() if err != nil { diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 4d43c9e2..cbdfd09f 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -273,7 +273,7 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_FLAGS=-t6", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) From d8be22e47d460d1483846e2effb9b67fbfce1c0b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 12:07:18 -0700 Subject: [PATCH 20/34] Fix overlapping artifact name on CI --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508fbb35..f6489dac 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -269,7 +269,7 @@ jobs: dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* @@ -328,7 +328,10 @@ jobs: name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-11 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps From f91c9e370923d3b10a88732ab577e2728022152d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 13:48:45 -0700 Subject: [PATCH 21/34] CI: handle directories during checksum (#6427) --- .github/workflows/release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f6489dac..aad49d98 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -472,7 +472,8 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - (cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . 
-type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ cat dist/sha256sum.txt - name: Create or update Release run: | From 19e5a890f70b95a55c9de6a55357d78fc0a4ff81 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 15:19:21 -0700 Subject: [PATCH 22/34] CI: remove directories from dist dir before upload step (#6429) --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index aad49d98..2cf4d2c2 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -474,6 +474,7 @@ jobs: ls -lh dist/ (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ + mv dist/linux-???64 . cat dist/sha256sum.txt - name: Create or update Release run: | From a017cf2fea4aaa376087520382058c42cffce097 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 20 Aug 2024 07:26:38 -0700 Subject: [PATCH 23/34] Split rocm back out of bundle (#6432) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle. --- .github/workflows/release.yaml | 1 + Dockerfile | 4 ++-- llm/generate/gen_linux.sh | 3 ++- scripts/build_linux.sh | 6 ++++++ scripts/install.sh | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 2cf4d2c2..9c1e3e13 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -475,6 +475,7 @@ jobs: (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ mv dist/linux-???64 . + mv dist/linux-amd64-rocm . 
cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/Dockerfile b/Dockerfile index d4b86918..c46477b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index aef03f9a..6927dda8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -260,7 +260,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index adda2ad7..6cb0d0cd 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -24,8 +24,14 @@ for TARGETARCH in ${BUILD_ARCH}; do docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist + fi docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/install.sh b/scripts/install.sh index a02a0675..25f57565 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -199,6 +199,11 @@ fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if [ $BUNDLE -ne 0 ]; then + status "Downloading Linux ROCm ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + install_success status "AMD GPU ready." 
exit 0 From 5a28b9cf5fcb3994aa1a143118c73c7d1fbf3bf9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 6 Jun 2024 08:59:04 -0700 Subject: [PATCH 24/34] bert --- convert/convert.go | 12 ++ convert/convert_bert.go | 176 +++++++++++++++++++++++++ convert/convert_test.go | 1 + convert/reader.go | 2 + convert/testdata/all-MiniLM-L6-v2.json | 124 +++++++++++++++++ convert/tokenizer.go | 31 ++--- 6 files changed, 331 insertions(+), 15 deletions(-) create mode 100644 convert/convert_bert.go create mode 100644 convert/testdata/all-MiniLM-L6-v2.json diff --git a/convert/convert.go b/convert/convert.go index 24c19aa4..f51e9665 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -66,6 +66,10 @@ type Converter interface { writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. @@ -95,6 +99,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &gemma{} case "Phi3ForCausalLM": conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -103,6 +109,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000..62fad147 --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,176 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return 
nil +} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["general.name"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + name := p.tensorName(t.Name()) + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) tensorName(n string) string { + return strings.NewReplacer( + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + ).Replace(n) +} diff --git a/convert/convert_test.go b/convert/convert_test.go index cb2c585e..e3ab0098 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -67,6 +67,7 @@ func TestConvertFull(t *testing.T) { "gemma-2b-it", // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e..294a7c40 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -37,6 +37,8 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { return 0 + } else if t.name == "embeddings.token_type_embeddings.weight" { + return 0 } switch len(t.shape) { diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000..15c8f039 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": 
"1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + 
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + 
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8..653df6d2 100644 --- 
a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) From beb49eef65acefc64a6ae0562ce58467e6974fde Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 7 Jun 2024 14:55:56 -0700 Subject: [PATCH 25/34] create bert models from cli --- cmd/cmd.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fd7246c8..a8a02605 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -223,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) 
+ if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -252,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err From 3546bbd08c52df73eb6523b06b13f1b2dfeaa5fb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 28 Jun 2024 13:27:05 -0700 Subject: [PATCH 26/34] convert gemma2 --- convert/convert.go | 11 ++++++-- convert/convert_bert.go | 9 +++--- convert/convert_gemma.go | 14 ++++----- convert/convert_gemma2.go | 44 +++++++++++++++++++++++++++++ convert/convert_llama.go | 19 ++++++------- convert/convert_mixtral.go | 9 ++++-- convert/convert_phi3.go | 11 ++++---- convert/convert_test.go | 1 + convert/reader.go | 12 ++++---- convert/reader_safetensors.go | 5 ++-- convert/reader_torch.go | 5 ++-- convert/testdata/gemma-2-9b-it.json | 6 ++++ convert/tokenizer_spm.go | 32 ++++++++++++++++++++- 13 files changed, 132 insertions(+), 46 deletions(-) create mode 100644 convert/convert_gemma2.go create mode 100644 convert/testdata/gemma-2-9b-it.json diff --git a/convert/convert.go b/convert/convert.go index f51e9665..5a314cdd 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -58,11 +59,13 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } @@ -97,6 +100,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} case "Phi3ForCausalLM": conv = &phi3{} case "BertModel": @@ -131,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 62fad147..4547a705 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -144,9 +144,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { continue } - name := p.tensorName(t.Name()) out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -156,8 +155,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) tensorName(n string) string { - return strings.NewReplacer( +func (bert) Replacements() []string { + return []string{ "encoder.layer", "blk", "encoder.layers", "blk", "embeddings.word_embeddings", "token_embd", @@ -172,5 +171,5 @@ func (bert) tensorName(n string) string { "intermediate.dense", "ffn_up", "output.dense", "ffn_down", "output.LayerNorm", "layer_output_norm", - ).Replace(n) + } } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e157..333e4c83 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -44,15 +44,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +61,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( +func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +75,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000..66be02d6 --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,44 @@ +package convert + +import ( + "github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + 
kv["general.architecture"] = "gemma2" + kv["general.name"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 178b13f3..498d1321 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -96,14 +96,13 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -113,8 +112,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -128,9 +127,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -140,9 +137,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b..97a86b30 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 0f645217..4ee59ff5 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -74,8 +74,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { out := make([]llm.Tensor, 0, len(ts)+2) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasPrefix(name, "blk.0.") { + if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { out = append(out, llm.Tensor{ Name: "rope_factors_long.weight", @@ -92,7 +91,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -102,8 +101,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) tensorName(n string) string { - return strings.NewReplacer( +func (p *phi3) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -114,7 +113,7 @@ func (p *phi3) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.gate_up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - ).Replace(n) + } } type ropeFactor []float32 diff --git a/convert/convert_test.go b/convert/convert_test.go index e3ab0098..e78afab7 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -68,6 +68,7 @@ func TestConvertFull(t *testing.T) { // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index 294a7c40..5bba0406 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,9 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { - return 0 - } else if t.name == "embeddings.token_type_embeddings.weight" { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -57,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -76,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5..32a362cd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf..1b3e1c9f 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000..90cdbee4 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c..5e506087 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} From 77903ab8b4fb8075faad7bde5bde2eee3173e407 Mon Sep 
17 00:00:00 2001 From: Michael Yang Date: Mon, 29 Jul 2024 14:53:02 -0700 Subject: [PATCH 27/34] llama3.1 --- convert/convert_bert.go | 1 - convert/convert_gemma.go | 1 - convert/convert_gemma2.go | 1 - convert/convert_llama.go | 43 +++++++++++++++++-- convert/convert_phi3.go | 1 - convert/convert_test.go | 1 + .../testdata/Meta-Llama-3.1-8B-Instruct.json | 3 ++ llm/memory_test.go | 1 - server/sched_test.go | 1 - 9 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 convert/testdata/Meta-Llama-3.1-8B-Instruct.json diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 4547a705..6e7d59fe 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -88,7 +88,6 @@ func (p *bert) parseMore(fsys fs.FS) error { func (p *bert) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "bert" - kv["general.name"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 333e4c83..c4316808 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 66be02d6..084f9c52 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -14,7 +14,6 @@ type gemma2 struct { func (p *gemma2) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma2" - kv["general.name"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize kv["gemma2.block_count"] = p.HiddenLayers diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 498d1321..27f924fb 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 
1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -95,6 +122,16 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 4ee59ff5..64d3d012 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil) func (p *phi3) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "phi3" - kv["general.name"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.feed_forward_length"] = p.IntermediateSize diff --git a/convert/convert_test.go b/convert/convert_test.go index e78afab7..64b7df3b 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,6 +62,7 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000..ad7cd20a --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f..ffb14286 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/server/sched_test.go b/server/sched_test.go index 713b9259..fb049574 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), From 90ca84172c2a98ecfd76eb7e05cd3e33e1dde507 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 Aug 2024 14:51:42 -0700 Subject: [PATCH 28/34] Fix embeddings memory corruption (#6467) * Fix embeddings 
memory corruption

The patch was leading to a buffer overrun corruption. Once removed
though, parallelism in server.cpp led to hitting an assert due to
slot/seq IDs being >= token count. To work around this, only use
slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
---
 integration/embed_test.go   |  8 ++---
 llm/ext_server/server.cpp   |  8 ++++-
 llm/patches/08-pooling.diff | 60 -------------------------------------
 server/sched.go             |  5 ++++
 4 files changed, 16 insertions(+), 65 deletions(-)
 delete mode 100644 llm/patches/08-pooling.diff

diff --git a/integration/embed_test.go b/integration/embed_test.go
index 10333d5d..4a68af68 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 5717c17a..8e08b850 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
             switch (task.type)
             {
                 case TASK_TYPE_COMPLETION: {
-                    server_slot *slot = prefix_slot(task.data["prompt"]);
+                    server_slot *slot = nullptr;
+                    if (task.embedding_mode) {
+                        // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                        slot = slots[0].available() ? 
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11e..00000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/server/sched.go b/server/sched.go index 9d8c4144..58071bf0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode From 0b03b9c32f483be2d7a4e902d13a909b546ae6bf Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 23 Aug 2024 11:20:39 -0700 Subject: [PATCH 29/34] llm: Align cmake define for cuda no peer copy (#6455) Define changed recently and this slipped through the 
---
 llm/generate/gen_linux.sh | 2 +-
 llm/generate/gen_windows.ps1 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 6927dda8..1f702ca2 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -252,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index cbdfd09f..7179c1bc 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -355,7 +355,7 @@ function build_rocm() {
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
             "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
             "-DGGML_AVX=on",
             "-DGGML_AVX2=off",

From 7a1e1c1cafe4d3f3f935dc7192f9e66d4b2185b3 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 23 Aug 2024 11:21:12 -0700
Subject: [PATCH 30/34] gpu: Ensure driver version set before variant (#6480)

During rebasing, the ordering was inverted, breaking the CUDA version
selection logic: the driver version was evaluated as zero, incorrectly
causing a downgrade to v11.
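A minimal, self-contained sketch of the failure mode follows. The gpuInfo
type and the v11/v12 threshold here are illustrative assumptions; the real
cudaVariant in the gpu package checks more fields. The point is simply that
deriving the variant before the driver fields are assigned leaves
DriverMajor at Go's zero value, which always selects v11:

package main

import "fmt"

// Illustrative stand-in for the GPU info the variant is derived from.
type gpuInfo struct{ DriverMajor int }

// cudaVariant stands in for the real selection logic, which keys off the
// driver version recorded on the GPU info (threshold assumed here).
func cudaVariant(g gpuInfo) string {
	if g.DriverMajor < 12 {
		return "v11"
	}
	return "v12"
}

func main() {
	var g gpuInfo               // driver fields not yet populated
	fmt.Println(cudaVariant(g)) // "v11": the incorrect downgrade
	g.DriverMajor = 12          // populate before deriving, as this patch does
	fmt.Println(cudaVariant(g)) // "v12"
}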
--- gpu/gpu.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 72d237a6..10afb1e3 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -264,6 +264,8 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + gpuInfo.DriverMajor = driverMajor + gpuInfo.DriverMinor = driverMinor variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath @@ -275,8 +277,6 @@ func GetGPUInfo() GpuInfoList { } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DriverMajor = driverMajor - gpuInfo.DriverMinor = driverMinor gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two From 0c819e167becd7f08312d2a1a1e2ac8e8ea5d4da Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 23 Aug 2024 11:29:56 -0700 Subject: [PATCH 31/34] convert safetensor adapters into GGUF (#6327) --- cmd/cmd.go | 6 + convert/convert.go | 111 +++++++++++-- convert/convert_bert.go | 18 +- convert/convert_gemma.go | 18 +- convert/convert_gemma2.go | 12 +- convert/convert_gemma2_adapter.go | 91 +++++++++++ convert/convert_llama.go | 16 +- convert/convert_llama_adapter.go | 169 +++++++++++++++++++ convert/convert_mixtral.go | 16 +- convert/convert_phi3.go | 14 +- convert/convert_test.go | 262 +++++++++++++++++++++++++++--- convert/reader.go | 2 + llm/ggml.go | 8 + server/images.go | 11 +- server/model.go | 38 ++++- server/model_test.go | 6 +- 16 files changed, 697 insertions(+), 101 deletions(-) create mode 100644 convert/convert_gemma2_adapter.go create mode 100644 convert/convert_llama_adapter.go diff --git a/cmd/cmd.go b/cmd/cmd.go index a8a02605..b75c0b5e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) { // safetensors files might be unresolved git lfs references; skip if they are // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapters.safetensors + files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapter_model.safetensors + files = append(files, st...) 
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 { // pytorch files might also be unresolved git lfs references; skip if they are // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin diff --git a/convert/convert.go b/convert/convert.go index 5a314cdd..8c7b0943 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -12,12 +12,22 @@ import ( "github.com/ollama/ollama/llm" ) -type Parameters struct { +type ModelParameters struct { Architectures []string `json:"architectures"` VocabSize uint32 `json:"vocab_size"` } -func (Parameters) KV(t *Tokenizer) llm.KV { +type AdapterParameters struct { + Alpha uint32 `json:"lora_alpha"` + LoraLayers uint32 `json:"lora_layers"` + LoraParameters struct { + Rank uint32 `json:"rank"` + Alpha float32 `json:"alpha"` + Scale float32 `json:"scale"` + } `json:"lora_parameters"` +} + +func (ModelParameters) KV(t *Tokenizer) llm.KV { kv := llm.KV{ "general.file_type": uint32(1), "general.quantization_version": uint32(2), @@ -44,17 +54,40 @@ func (Parameters) KV(t *Tokenizer) llm.KV { return kv } -func (Parameters) specialTokenTypes() []string { +func (p AdapterParameters) KV() llm.KV { + var alpha float32 + if p.LoraParameters.Alpha == 0 { + alpha = float32(p.Alpha) + } else { + alpha = p.LoraParameters.Alpha + } + + kv := llm.KV{ + "adapter.lora.alpha": alpha, + "adapter.type": "lora", + "general.file_type": uint32(1), + "general.type": "adapter", + "general.version": "v0.2", + } + + return kv +} + +func (ModelParameters) specialTokenTypes() []string { return []string{ "bos", "eos", "unk", "sep", "pad", "cls", "mask", } } -func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { +func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { return llm.WriteGGUF(ws, kv, ts) } -type Converter interface { +func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { + return llm.WriteGGUF(ws, kv, ts) +} + +type ModelConverter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. @@ -73,17 +106,67 @@ type moreParser interface { parseMore(fs.FS) error } +type AdapterConverter interface { + // KV maps parameters to LLM key-values + KV(llm.KV) llm.KV + // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. + Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string + + writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error +} + +func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error { + bts, err := fs.ReadFile(fsys, "adapter_config.json") + if err != nil { + return err + } + + var p AdapterParameters + if err := json.Unmarshal(bts, &p); err != nil { + return err + } + + arch, ok := baseKV["general.architecture"] + if !ok { + return errors.New("architecture not set for the base model") + } + + var conv AdapterConverter + switch arch { + case "llama": + conv = &llamaAdapter{} + case "gemma2": + conv = &gemma2Adapter{} + default: + return errors.New("unsupported architecture") + } + + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) + if err != nil { + return err + } + + if err := json.Unmarshal(bts, conv); err != nil { + return err + } + + return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts)) +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. -func Convert(fsys fs.FS, ws io.WriteSeeker) error { +func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err } - var p Parameters + var p ModelParameters if err := json.Unmarshal(bts, &p); err != nil { return err } @@ -92,20 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return errors.New("unknown architecture") } - var conv Converter + var conv ModelConverter switch p.Architectures[0] { case "LlamaForCausalLM", "MistralForCausalLM": - conv = &llama{} + conv = &llamaModel{} case "MixtralForCausalLM": - conv = &mixtral{} + conv = &mixtralModel{} case "GemmaForCausalLM": - conv = &gemma{} + conv = &gemmaModel{} case "Gemma2ForCausalLM": - conv = &gemma2{} + conv = &gemma2Model{} case "Phi3ForCausalLM": - conv = &phi3{} + conv = &phi3Model{} case "BertModel": - conv = &bert{} + conv = &bertModel{} default: return errors.New("unsupported architecture") } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 6e7d59fe..ea5facaa 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type bert struct { - Parameters +type bertModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -33,11 +33,11 @@ type bert struct { } var ( - _ Converter = (*bert)(nil) - _ moreParser = (*bert)(nil) + _ ModelConverter = (*bertModel)(nil) + _ moreParser = (*bertModel)(nil) ) -func (p *bert) parseMore(fsys fs.FS) error { +func (p *bertModel) parseMore(fsys fs.FS) error { bts, err := fs.ReadFile(fsys, "modules.json") if err != nil { return err @@ -85,8 +85,8 @@ func (p *bert) parseMore(fsys fs.FS) error { return nil } -func (p *bert) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *bertModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType @@ -132,7 +132,7 @@ func (p *bert) KV(t *Tokenizer) llm.KV { return kv } -func (p *bert) Tensors(ts []Tensor) []llm.Tensor { +func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range 
ts { if slices.Contains([]string{ @@ -154,7 +154,7 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) Replacements() []string { +func (bertModel) Replacements() []string { return []string{ "encoder.layer", "blk", "encoder.layers", "blk", diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index c4316808..b8865294 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -9,8 +9,8 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma struct { - Parameters +type gemmaModel struct { + ModelParameters MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` HiddenSize uint32 `json:"hidden_size"` HiddenLayers uint32 `json:"num_hidden_layers"` @@ -21,10 +21,10 @@ type gemma struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*gemma)(nil) +var _ ModelConverter = (*gemmaModel)(nil) -func (p *gemma) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemmaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize @@ -42,8 +42,8 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - out := make([]llm.Tensor, 0, len(ts)) +func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor for _, t := range ts { if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) @@ -60,7 +60,7 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) Replacements() []string { +func (p *gemmaModel) Replacements() []string { return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -77,7 +77,7 @@ func (p *gemma) Replacements() []string { } } -func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { +func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) ones := tensor.Ones(tensor.Float32, int(shape[0])) diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 084f9c52..c4ee2d09 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -4,15 +4,15 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma2 struct { - gemma +type gemma2Model struct { + gemmaModel SlidingWindow uint32 `json:"sliding_window"` AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` FinalLogitSoftcap float32 `json:"final_logit_softcapping"` } -func (p *gemma2) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemma2Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize @@ -33,9 +33,9 @@ func (p *gemma2) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma2) Replacements() []string { +func (p *gemma2Model) Replacements() []string { return append( - p.gemma.Replacements(), + p.gemmaModel.Replacements(), "post_attention_layernorm", "post_attention_norm", "pre_feedforward_layernorm", "ffn_norm", "post_feedforward_layernorm", "post_ffw_norm", diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go new file mode 100644 index 00000000..a89a25f4 --- /dev/null +++ b/convert/convert_gemma2_adapter.go @@ -0,0 +1,91 @@ +package convert + +import ( + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + 
"github.com/ollama/ollama/llm" +) + +type gemma2Adapter struct { + AdapterParameters +} + +var _ AdapterConverter = (*gemma2Adapter)(nil) + +func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "gemma2" + return kv +} + +func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *gemma2Adapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", "weight.lora_b", + } +} + +func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) 
+ } + + return f32s, nil +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 27f924fb..5dedb829 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -12,8 +12,8 @@ import ( "github.com/ollama/ollama/llm" ) -type llama struct { - Parameters +type llamaModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -44,10 +44,10 @@ type llama struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*llama)(nil) +var _ ModelConverter = (*llamaModel)(nil) -func (p *llama) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *llamaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "llama" kv["llama.vocab_size"] = p.VocabSize @@ -120,7 +120,7 @@ func (p *llama) KV(t *Tokenizer) llm.KV { return kv } -func (p *llama) Tensors(ts []Tensor) []llm.Tensor { +func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor if p.RopeScaling.factors != nil { @@ -149,7 +149,7 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) Replacements() []string { +func (p *llamaModel) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", @@ -167,7 +167,7 @@ func (p *llama) Replacements() []string { } } -func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { +func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) { var dims []int for _, dim := range shape { dims = append(dims, int(dim)) diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go new file mode 100644 index 00000000..08ddee10 --- /dev/null +++ b/convert/convert_llama_adapter.go @@ -0,0 +1,169 @@ +package convert + +import ( + "cmp" + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + "github.com/ollama/ollama/llm" +) + +type llamaAdapter struct { + AdapterParameters + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` +} + +var _ AdapterConverter = (*llamaAdapter)(nil) + +func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "llama" + kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"] + kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"] + + p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32) + + return kv +} + +func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repackAndTranspose) + } else { + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: shape, + WriterTo: t, + }) + } + + return out +} + +func (p *llamaAdapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", 
"weight.lora_b", + } +} + +func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } else { + return data, nil + } + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} + +func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } + + if heads > 0 { + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + } + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 97a86b30..43b7c8b1 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -9,14 +9,14 @@ import ( "github.com/ollama/ollama/llm" ) -type mixtral struct { - llama +type mixtralModel struct { + llamaModel NumLocalExperts uint32 `json:"num_local_experts"` NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -func (p *mixtral) KV(t *Tokenizer) llm.KV { - kv := p.llama.KV(t) +func (p *mixtralModel) KV(t *Tokenizer) llm.KV { + kv := p.llamaModel.KV(t) if p.NumLocalExperts > 0 { kv["llama.expert_count"] = p.NumLocalExperts @@ -29,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV { return kv } -func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -67,12 +67,12 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { }) } - return append(out, p.llama.Tensors(ts)...) + return append(out, p.llamaModel.Tensors(ts)...) 
} -func (p *mixtral) Replacements() []string { +func (p *mixtralModel) Replacements() []string { return append( - p.llama.Replacements(), + p.llamaModel.Replacements(), "block_sparse_moe.gate", "ffn_gate_inp", ) } diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 64d3d012..3de0d404 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type phi3 struct { - Parameters +type phi3Model struct { + ModelParameters NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayers uint32 `json:"n_layers"` HiddenSize uint32 `json:"hidden_size"` @@ -35,10 +35,10 @@ type phi3 struct { SlidingWindow uint32 `json:"sliding_window"` } -var _ Converter = (*phi3)(nil) +var _ ModelConverter = (*phi3Model)(nil) -func (p *phi3) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *phi3Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) @@ -68,7 +68,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV { return kv } -func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor { var addRopeFactors sync.Once out := make([]llm.Tensor, 0, len(ts)+2) @@ -100,7 +100,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) Replacements() []string { +func (p *phi3Model) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", diff --git a/convert/convert_test.go b/convert/convert_test.go index 64b7df3b..56b34f22 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -1,7 +1,9 @@ package convert import ( + "bytes" "crypto/sha256" + "encoding/binary" "encoding/hex" "encoding/json" "flag" @@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { } defer f.Close() - if err := Convert(fsys, f); err != nil { + if err := ConvertModel(fsys, f); err != nil { t.Fatal(err) } @@ -51,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { return r, m.KV(), m.Tensors() } +func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string { + actual := make(map[string]string) + for k, v := range kv { + if s, ok := v.(json.Marshaler); !ok { + actual[k] = fmt.Sprintf("%v", v) + } else { + bts, err := json.Marshal(s) + if err != nil { + t.Fatal(err) + } + + actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) + } + } + + for _, tensor := range tensors.Items { + sha256sum := sha256.New() + sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size())) + if _, err := io.Copy(sha256sum, sr); err != nil { + t.Fatal(err) + } + + actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) + } + + return actual +} + func TestMain(m *testing.M) { var level slog.Level flag.TextVar(&level, "level", slog.LevelInfo, "log level") @@ -85,29 +115,7 @@ func TestConvertFull(t *testing.T) { } f, kv, tensors := convertFull(t, os.DirFS(p)) - actual := make(map[string]string) - for k, v := range kv { - if s, ok := v.(json.Marshaler); !ok { - actual[k] = fmt.Sprintf("%v", v) - } else { - bts, err := json.Marshal(s) - if err != nil { - t.Fatal(err) - } - - actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) - } - } - - for _, tensor := range tensors.Items { - sha256sum := sha256.New() - sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), 
int64(tensor.Size())) - if _, err := io.Copy(sha256sum, sr); err != nil { - t.Fatal(err) - } - - actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) - } + actual := generateResultsJSON(t, f, kv, tensors) expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) if err != nil { @@ -131,3 +139,209 @@ func TestConvertFull(t *testing.T) { }) } } + +func TestConvertAdapter(t *testing.T) { + type AdapterCase struct { + Name string + BaseKV map[string]any + Expected map[string]string + } + + cases := []AdapterCase{ + { + Name: "discollama", + BaseKV: map[string]any{ + "general.architecture": "llama", + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + }, + Expected: map[string]string{ + "general.architecture": "llama", + "general.file_type": "1", + "general.parameter_count": "106496", + "general.type": "adapter", + "general.version": "v0.2", + "adapter.lora.alpha": "16", + "adapter.type": "lora", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857", + }, + }, + } + + for _, c := range cases { + t.Run(c.Name, func(t *testing.T) { + t.Parallel() + + f, err := os.CreateTemp(t.TempDir(), "f16") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + tempDir := t.TempDir() + generateLoraTestData(t, tempDir) + + if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil { + t.Fatal(err) + } + + r, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + m, _, err := llm.DecodeGGML(r, math.MaxInt) + if err != nil { + t.Fatal(err) + } + + if _, err := r.Seek(0, io.SeekStart); err != nil { + t.Fatal(err) + } + + actual := generateResultsJSON(t, r, m.KV(), m.Tensors()) + + keys := maps.Keys(c.Expected) + slices.Sort(keys) + for _, k := range keys { + if v, ok := actual[k]; !ok { + t.Errorf("missing %s", k) + } else if v != c.Expected[k] { + t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v) + } + } + }) + } +} + +func generateLoraTestData(t *testing.T, tempDir string) { + type tensorData struct { + Offsets []int `json:"data_offsets"` + Type string `json:"dtype"` + Shape []int `json:"shape"` + } + offset := 4096 * 8 * 4 + + td := map[string]*tensorData{"__metadata__": nil} + td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{ + Offsets: []int{0, offset}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{ + Offsets: []int{offset, offset * 2}, + Type: "F32", + Shape: []int{8, 4096}, + } + td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{ + Offsets: []int{offset * 2, offset * 3}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{ + Offsets: []int{offset * 3, offset*3 + 8*1024*4}, + Type: "F32", + Shape: []int{8, 1024}, + } + + data, err := json.Marshal(td) + if err != nil { + t.Fatal(err) + } + + var buf bytes.Buffer + + l := int64(len(data)) + err = binary.Write(&buf, binary.LittleEndian, l) + if err != nil { + t.Fatal(err) + } + + _, err = buf.Write(data) + if err != nil { + t.Fatal(err) + } + + // write some data for the tensors + + 
ones := make([]float32, 4096*8) + for i := range ones { + ones[i] = float32(1) + } + + for range 3 { + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + } + + ones = make([]float32, 1024*8) + for i := range ones { + ones[i] = float32(1) + } + + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + + fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors")) + if err != nil { + t.Fatal(err) + } + defer fdata.Close() + + _, err = fdata.Write(buf.Bytes()) + if err != nil { + t.Fatal(err) + } + + configData := ` +{ + "adapter_path": "adapters-test", + "batch_size": 8, + "config": "config-tiny.json", + "data": "../discollama-completion", + "grad_checkpoint": null, + "iters": 1000, + "learning_rate": 1e-05, + "lora_layers": 1, + "lora_parameters": { + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "scale": 2.0 + }, + "lr_schedule": null, + "max_seq_length": 2048, + "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct", + "resume_adapter_file": null, + "save_every": 100, + "seed": 0, + "steps_per_eval": 200, + "steps_per_report": 10, + "test": false, + "test_batches": 500, + "train": true, + "use_dora": false, + "val_batches": 25 +} +` + f, err := os.Create(filepath.Join(tempDir, "adapter_config.json")) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + _, err = f.WriteString(configData) + if err != nil { + t.Fatal(err) + } +} diff --git a/convert/reader.go b/convert/reader.go index 5bba0406..c1218e66 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -64,6 +64,8 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, + {"adapters.safetensors", parseSafetensors}, + {"adapter_model.safetensors", parseSafetensors}, {"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model.bin", parseTorch}, {"consolidated.*.pth", parseTorch}, diff --git a/llm/ggml.go b/llm/ggml.go index 4c68adf9..ab436095 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -43,6 +43,14 @@ func (kv KV) Architecture() string { return "unknown" } +func (kv KV) Kind() string { + if s, ok := kv["general.type"].(string); ok { + return s + } + + return "unknown" +} + func (kv KV) ParameterCount() uint64 { return kv.u64("general.parameter_count") } diff --git a/server/images.go b/server/images.go index 8b3a67cf..b5bf7ad6 100644 --- a/server/images.go +++ b/server/images.go @@ -369,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio parameters := make(map[string]any) var layers []Layer + var baseLayers []*layerGGML for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) + command := c.Name - switch c.Name { + switch command { case "model", "adapter": - var baseLayers []*layerGGML - if name := model.ParseName(c.Args); name.IsValid() { + if name := model.ParseName(c.Args); name.IsValid() && command == "model" { baseLayers, err = parseFromModel(ctx, name, fn) if err != nil { return err @@ -409,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } defer blob.Close() - baseLayers, err = parseFromFile(ctx, blob, digest, fn) + baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn) if err != nil { return err } } else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil { defer file.Close() - baseLayers, err = parseFromFile(ctx, file, "", fn) + baseLayers, err = 
parseFromFile(ctx, command, baseLayers, file, "", fn) if err != nil { return err } diff --git a/server/model.go b/server/model.go index b17bf0e3..55fb2d8d 100644 --- a/server/model.go +++ b/server/model.go @@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { fi, err := f.Stat() if err != nil { return nil, err @@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. defer t.Close() defer os.Remove(t.Name()) - fn(api.ProgressResponse{Status: "converting model"}) - if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil { - return nil, err + var layerType string + + switch command { + case "adapter": + var baseModel *llm.GGML + for _, l := range baseLayers { + if l.GGML != nil { + baseModel = l.GGML + break + } + } + + if baseModel == nil { + return nil, fmt.Errorf("no base model specified for the adapter") + } + + if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.adapter" + case "model": + if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.model" } if _, err := t.Seek(0, io.SeekStart); err != nil { return nil, err } - layer, err := NewLayer(t, "application/vnd.ollama.image.model") + layer, err := NewLayer(t, layerType) if err != nil { return nil, err } @@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. 
return detectChatTemplate(layers) } -func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { sr := io.NewSectionReader(file, 0, 512) contentType, err := detectContentType(sr) if err != nil { @@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap case "gguf", "ggla": // noop case "application/zip": - return parseFromZipFile(ctx, file, digest, fn) + return parseFromZipFile(ctx, command, baseLayers, file, digest, fn) default: return nil, fmt.Errorf("unsupported content type: %s", contentType) } @@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap } mediatype := "application/vnd.ollama.image.model" - if ggml.Name() == "ggla" { + if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" { mediatype = "application/vnd.ollama.image.adapter" } else if ggml.KV().Architecture() == "clip" { mediatype = "application/vnd.ollama.image.projector" diff --git a/server/model_test.go b/server/model_test.go index 63fc408d..7753c549 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } From bb362caf88487a08cb29c8263d383a7d9e448803 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 2 Jul 2024 15:02:07 -0700 Subject: [PATCH 32/34] update faq --- docs/faq.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 324116d1..25b68248 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables. ## How do I use Ollama behind a proxy? -Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. +Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. 
Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. + +> [!NOTE] +> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server. ### How do I use Ollama behind a proxy in Docker? @@ -276,4 +279,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit ## How does Ollama load models on multiple GPUs? -Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. \ No newline at end of file +Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. From 69be940bf6d2816f61c79facfa336183bc882720 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 23 Aug 2024 15:11:56 -0700 Subject: [PATCH 33/34] gpu: Group GPU Library sets by variant (#6483) The recent cuda variant changes uncovered a bug in ByLibrary which failed to group by common variant for GPU types. 
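A hedged sketch of the corrected grouping semantics, mirroring the new test
cases below (GpuInfoList, GpuInfo, and ByLibrary are the types this patch
touches; the call site itself is illustrative): GPUs that share a library
but differ in variant must land in separate groups, so the grouping key has
to include the variant, not just the bare library name.

gpus := GpuInfoList{
	{Library: "cpu"},
	{Library: "cuda", Variant: "v11"},
	{Library: "cuda", Variant: "v12"},
}
// Before the fix the two cuda entries collapsed into a single group keyed
// by the bare library name; with the fix ByLibrary returns three groups:
// [cpu], [cuda v11], [cuda v12].
groups := gpus.ByLibrary()
fmt.Println(len(groups)) // 3 with the fix, 2 before it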
--- gpu/gpu_test.go | 25 +++++++++++++++++++++++++ gpu/types.go | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go index 46d3201e..13a3f544 100644 --- a/gpu/gpu_test.go +++ b/gpu/gpu_test.go @@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) { } } +func TestByLibrary(t *testing.T) { + type testCase struct { + input []GpuInfo + expect int + } + + testCases := map[string]*testCase{ + "empty": {input: []GpuInfo{}, expect: 0}, + "cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1}, + "cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2}, + "cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2}, + "cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2}, + "cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3}, + } + + for k, v := range testCases { + t.Run(k, func(t *testing.T) { + resp := (GpuInfoList)(v.input).ByLibrary() + if len(resp) != v.expect { + t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp) + } + }) + } +} + // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected diff --git a/gpu/types.go b/gpu/types.go index 4cbbeb84..a30e5fb3 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -94,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { } } if !found { - libs = append(libs, info.Library) + libs = append(libs, requested) resp = append(resp, []GpuInfo{info}) } } From 0f92b19bec97198b035a7801eda14e3d48149033 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 24 Aug 2024 17:24:50 -0700 Subject: [PATCH 34/34] Only enable numa on CPUs (#6484) The numa flag may be having a performance impact on multi-socket systems with GPU loads --- llm/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 9347a458..4e5dac28 100644 --- a/llm/server.go +++ b/llm/server.go @@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--mlock") } - if gpu.IsNUMA() { + if gpu.IsNUMA() && gpus[0].Library == "cpu" { numaMode := "distribute" if runtime.GOOS == "linux" { if _, err := exec.LookPath("numactl"); err == nil {
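For context, a hedged sketch of the resulting behavior (the tail of this
function is not shown above, so the numactl fallback and the exact flag
shape are assumptions): NUMA tuning is now requested only when the entire
load runs on CPU.

if gpu.IsNUMA() && gpus[0].Library == "cpu" {
	// GPU loads on multi-socket systems skip the flag entirely, avoiding
	// the performance impact described in the commit message.
	numaMode := "distribute"
	if runtime.GOOS == "linux" {
		if _, err := exec.LookPath("numactl"); err == nil {
			numaMode = "numactl" // assumed: prefer numactl when available
		}
	}
	params = append(params, "--numa", numaMode) // assumed flag shape
}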