Merge https://github.com/ollama/ollama

Signed-off-by: baalajimaestro <me@baalajimaestro.me>
Only enable numa on CPUs (#6484)
2024-08-25 22:02:07 +05:30 · 2024-08-24 17:24:50 -07:00 · 2024-08-23 15:11:56 -07:00 · 2024-08-23 14:05:59 -07:00 · 2024-08-23 13:37:21 -07:00 · 2024-08-23 11:29:56 -07:00
59 changed files with 1647 additions and 471 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -187,6 +187,13 @@ jobs:
  generate-windows-cuda:
    environment: release
    runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
@ -220,11 +227,11 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA'
+      - name: 'Install CUDA ${{ matrix.cuda.version }}'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
@ -256,15 +263,16 @@ jobs:
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: windows-cuda-deps-${{ matrix.cuda.version }}
          path: dist/deps/*

+
  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release
@ -314,10 +322,16 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-11
      - uses: actions/download-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: generate-windows-cuda-12
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-11
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-12
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
@ -363,7 +377,6 @@ jobs:
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@ -459,7 +472,10 @@ jobs:
          merge-multiple: true
      - run: |
          ls -lh dist/
-          (cd dist; sha256sum * > sha256sum.txt)
+          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
+          mv sha256sum.txt dist/
+          mv dist/linux-???64 .
+          mv dist/linux-amd64-rocm .
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
--- a/app/ollama.iss
+++ b/app/ollama.iss
@ -87,20 +87,11 @@ DialogFontSize=12

 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
+Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
-
+Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs

 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@ -108,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
 Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

 [Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

 [UninstallRun]
 ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@ -143,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

 [Registry]
 Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
-    Check: NeedsAddPath('{app}')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
+    Check: NeedsAddPath('{app}\bin')

 [Code]

--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
+	"syscall"
 	"unsafe"

 	"golang.org/x/sys/windows"
@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON
+	t.nid.Flags |= NIF_ICON | NIF_TIP
+	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+		copy(t.nid.Tip[:], toolTipUTF16)
+	} else {
+		return err
+	}
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

 	return t.nid.modify()
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@ -61,6 +61,7 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
+	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
 		// safetensors files might be unresolved git lfs references; skip if they are
 		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
 		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapters.safetensors
+		files = append(files, st...)
+	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
+		// covers adapter_model.safetensors
+		files = append(files, st...)
 	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@ -223,6 +229,14 @@ func tempZipFiles(path string) (string, error) {
 	}
 	files = append(files, js...)

+	// bert models require a nested config.json
+	// TODO(mxyng): merge this with the glob above
+	js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
+	if err != nil {
+		return "", err
+	}
+	files = append(files, js...)
+
 	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
 		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
 		// tokenizer.model might be a unresolved git lfs reference; error if it is
@ -252,6 +266,11 @@ func tempZipFiles(path string) (string, error) {
 			return "", err
 		}

+		zfi.Name, err = filepath.Rel(path, file)
+		if err != nil {
+			return "", err
+		}
+
 		zf, err := zipfile.CreateHeader(zfi)
 		if err != nil {
 			return "", err
--- a/convert/convert.go
+++ b/convert/convert.go
@ -7,16 +7,27 @@ import (
 	"io"
 	"io/fs"
 	"log/slog"
+	"strings"

 	"github.com/ollama/ollama/llm"
 )

-type Parameters struct {
+type ModelParameters struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     uint32   `json:"vocab_size"`
 }

-func (Parameters) KV(t *Tokenizer) llm.KV {
+// AdapterParameters holds the fields ConvertAdapter decodes from an adapter's
+// adapter_config.json.
+type AdapterParameters struct {
+	// Alpha is the top-level "lora_alpha" value; KV falls back to it when
+	// LoraParameters.Alpha is zero.
+	Alpha          uint32 `json:"lora_alpha"`
+	LoraLayers     uint32 `json:"lora_layers"`
+	// LoraParameters is the nested "lora_parameters" object.
+	LoraParameters struct {
+		Rank  uint32  `json:"rank"`
+		Alpha float32 `json:"alpha"`
+		Scale float32 `json:"scale"`
+	} `json:"lora_parameters"`
+}
+
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	kv := llm.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
@ -43,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (Parameters) specialTokenTypes() []string {
+// KV returns the key-values common to every LoRA adapter.
+//
+// The nested lora_parameters alpha takes precedence; the top-level lora_alpha
+// is used only when the nested value is zero.
+func (p AdapterParameters) KV() llm.KV {
+	alpha := p.LoraParameters.Alpha
+	if alpha == 0 {
+		alpha = float32(p.Alpha)
+	}
+
+	return llm.KV{
+		"adapter.lora.alpha": alpha,
+		"adapter.type":       "lora",
+		"general.file_type":  uint32(1),
+		"general.type":       "adapter",
+		"general.version":    "v0.2",
+	}
+}
+
+func (ModelParameters) specialTokenTypes() []string {
 	return []string{
 		"bos", "eos", "unk", "sep", "pad", "cls", "mask",
 	}
 }

-func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
 	return llm.WriteGGUF(ws, kv, ts)
 }

-type Converter interface {
+// writeFile writes the key-values and tensors to ws by delegating to llm.WriteGGUF.
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+	return llm.WriteGGUF(ws, kv, ts)
+}
+
+type ModelConverter interface {
 	// KV maps parameters to LLM key-values
 	KV(*Tokenizer) llm.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
 	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string

-	// tensorName returns the LLM tensor name for a specific input name
-	tensorName(string) string
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
+	// writeFile writes the model to the provided io.WriteSeeker
 	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
 }

+// moreParser is an optional interface for converters that need to read extra
+// files from the input file system. ConvertModel calls parseMore when the
+// selected converter implements it and aborts the conversion on error.
+type moreParser interface {
+	parseMore(fs.FS) error
+}
+
+// AdapterConverter is implemented by the per-architecture adapter converters
+// that ConvertAdapter selects between.
+type AdapterConverter interface {
+	// KV maps parameters to LLM key-values
+	KV(llm.KV) llm.KV
+	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
+	Tensors([]Tensor) []llm.Tensor
+	// Replacements returns a list of string pairs to replace in tensor names.
+	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+	Replacements() []string
+
+	// writeFile writes the adapter to the provided io.WriteSeeker
+	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+}
+
+// ConvertAdapter converts the adapter found in fsys and writes it to ws.
+//
+// It reads adapter_config.json, picks a converter from the base model's
+// "general.architecture" key-value (llama or gemma2), parses the tensors with
+// their names rewritten by the converter's Replacements, and writes the result
+// through the converter's writeFile.
+//
+// An error is returned when the config cannot be read or decoded, when baseKV
+// has no architecture, when the architecture is unsupported, or when parsing
+// or writing fails.
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+	config, err := fs.ReadFile(fsys, "adapter_config.json")
+	if err != nil {
+		return err
+	}
+
+	// decode into the shared parameter set first so a malformed config is
+	// rejected before any converter is chosen
+	var params AdapterParameters
+	if err = json.Unmarshal(config, &params); err != nil {
+		return err
+	}
+
+	arch, found := baseKV["general.architecture"]
+	if !found {
+		return errors.New("architecture not set for the base model")
+	}
+
+	var adapter AdapterConverter
+	switch arch {
+	case "gemma2":
+		adapter = &gemma2Adapter{}
+	case "llama":
+		adapter = &llamaAdapter{}
+	default:
+		return errors.New("unsupported architecture")
+	}
+
+	renamer := strings.NewReplacer(adapter.Replacements()...)
+	tensors, err := parseTensors(fsys, renamer)
+	if err != nil {
+		return err
+	}
+
+	if err = json.Unmarshal(config, adapter); err != nil {
+		return err
+	}
+
+	// KV must run before Tensors: converters may record base-model values in
+	// KV that their tensor repackers rely on
+	kv := adapter.KV(baseKV)
+	return adapter.writeFile(ws, kv, adapter.Tensors(tensors))
+}
+
 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func Convert(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	bts, err := fs.ReadFile(fsys, "config.json")
 	if err != nil {
 		return err
 	}

-	var p Parameters
+	var p ModelParameters
 	if err := json.Unmarshal(bts, &p); err != nil {
 		return err
 	}
@ -85,16 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return errors.New("unknown architecture")
 	}

-	var conv Converter
+	var conv ModelConverter
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llama{}
+		conv = &llamaModel{}
 	case "MixtralForCausalLM":
-		conv = &mixtral{}
+		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
-		conv = &gemma{}
+		conv = &gemmaModel{}
+	case "Gemma2ForCausalLM":
+		conv = &gemma2Model{}
 	case "Phi3ForCausalLM":
-		conv = &phi3{}
+		conv = &phi3Model{}
+	case "BertModel":
+		conv = &bertModel{}
 	default:
 		return errors.New("unsupported architecture")
 	}
@ -103,6 +197,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}

+	if t, ok := conv.(moreParser); ok {
+		if err := t.parseMore(fsys); err != nil {
+			return err
+		}
+	}
+
 	t, err := parseTokenizer(fsys, conv.specialTokenTypes())
 	if err != nil {
 		return err
@ -119,7 +219,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}

-	ts, err := parseTensors(fsys)
+	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
 	if err != nil {
 		return err
 	}
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@ -0,0 +1,174 @@
+package convert
+
+import (
+	"cmp"
+	"encoding/json"
+	"io/fs"
+	"path/filepath"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/llm"
+)
+
+// bertModel holds the config.json fields used to convert a BERT model.
+//
+// Several settings are accepted under more than one JSON key; KV picks the
+// first non-zero value of each group with cmp.Or.
+type bertModel struct {
+	ModelParameters
+	// block count: n_layers, num_hidden_layers, n_layer
+	NLayers               uint32  `json:"n_layers"`
+	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
+	NLayer                uint32  `json:"n_layer"`
+	// context length: max_position_embeddings, n_ctx
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	NCtx                  uint32  `json:"n_ctx"`
+	// embedding length: hidden_size, n_embd
+	HiddenSize            uint32  `json:"hidden_size"`
+	NEmbd                 uint32  `json:"n_embd"`
+	// feed forward length: intermediate_size, n_inner
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	NInner                uint32  `json:"n_inner"`
+	// attention head count: num_attention_heads, n_head
+	NumAttentionHeads     uint32  `json:"num_attention_heads"`
+	NHead                 uint32  `json:"n_head"`
+	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
+	// layer norm epsilon: layer_norm_eps, layer_norm_epsilon, norm_epsilon
+	LayerNormEPS          float32 `json:"layer_norm_eps"`
+	LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
+	NormEpsilon           float32 `json:"norm_epsilon"`
+
+	// PoolingType is not read from config.json; parseMore sets it to 1 for
+	// mean-token pooling or 2 for CLS-token pooling and leaves it zero otherwise.
+	PoolingType uint32
+}
+
+var (
+	_ ModelConverter = (*bertModel)(nil)
+	_ moreParser     = (*bertModel)(nil)
+)
+
+// parseMore reads the sentence-transformers pooling configuration.
+//
+// It loads modules.json from fsys and looks for the first module whose type is
+// "sentence_transformers.models.Pooling". If one is listed, config.json is read
+// from that module's path and PoolingType is set to 1 when mean-token pooling
+// is enabled, otherwise to 2 when CLS-token pooling is enabled. PoolingType is
+// left unchanged when no pooling module is listed or neither flag is set.
+//
+// An error is returned when modules.json or the pooling config cannot be read
+// or decoded.
+func (p *bertModel) parseMore(fsys fs.FS) error {
+	bts, err := fs.ReadFile(fsys, "modules.json")
+	if err != nil {
+		return err
+	}
+
+	var modules []struct {
+		Type string `json:"type"`
+		Path string `json:"path"`
+	}
+
+	if err := json.Unmarshal(bts, &modules); err != nil {
+		return err
+	}
+
+	var pooling string
+	for _, m := range modules {
+		if m.Type == "sentence_transformers.models.Pooling" {
+			pooling = m.Path
+			break
+		}
+	}
+
+	if pooling != "" {
+		// fs.FS names are always slash-separated. filepath.Join uses the OS
+		// separator, which yields backslashes on Windows and makes the name
+		// invalid for fs.ReadFile, so normalise it before the lookup.
+		name := filepath.ToSlash(filepath.Join(pooling, "config.json"))
+		bts, err := fs.ReadFile(fsys, name)
+		if err != nil {
+			return err
+		}
+
+		var pc struct {
+			PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
+			PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
+		}
+
+		if err := json.Unmarshal(bts, &pc); err != nil {
+			return err
+		}
+
+		// mean pooling wins when both flags are set
+		if pc.PoolingModeMeanTokens {
+			p.PoolingType = 1
+		} else if pc.PoolingModeCLSToken {
+			p.PoolingType = 2
+		}
+	}
+
+	return nil
+}
+
+// KV builds the key-values for a BERT model on top of ModelParameters.KV.
+//
+// block_count is always written; context length, embedding length, feed
+// forward length, head count and layer norm epsilon are written only when one
+// of their accepted config keys is non-zero.
+//
+// The tokenizer vocabulary in t.Tokens is rewritten in place: tokens wrapped
+// in square brackets are kept, a leading "##" is stripped, and every other
+// token is prefixed with U+2581.
+func (p *bertModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "bert"
+	kv["bert.attention.causal"] = false
+	kv["bert.pooling_type"] = p.PoolingType
+
+	kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
+
+	if n := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); n > 0 {
+		kv["bert.context_length"] = n
+	}
+
+	if n := cmp.Or(p.HiddenSize, p.NEmbd); n > 0 {
+		kv["bert.embedding_length"] = n
+	}
+
+	if n := cmp.Or(p.IntermediateSize, p.NInner); n > 0 {
+		kv["bert.feed_forward_length"] = n
+	}
+
+	if n := cmp.Or(p.NumAttentionHeads, p.NHead); n > 0 {
+		kv["bert.attention.head_count"] = n
+	}
+
+	if eps := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); eps > 0 {
+		kv["bert.attention.layer_norm_epsilon"] = eps
+	}
+
+	kv["tokenizer.ggml.model"] = "bert"
+	kv["tokenizer.ggml.token_type_count"] = uint32(2)
+
+	// convert to phantom space tokens
+	for i, tok := range t.Tokens {
+		switch {
+		case strings.HasPrefix(tok, "[") && strings.HasSuffix(tok, "]"):
+			// bracketed tokens are kept verbatim
+		case strings.HasPrefix(tok, "##"):
+			t.Tokens[i] = tok[2:]
+		default:
+			t.Tokens[i] = "\u2581" + tok
+		}
+	}
+
+	kv["tokenizer.ggml.tokens"] = t.Tokens
+
+	return kv
+}
+
+// Tensors maps the parsed input tensors to LLM tensors, dropping the position
+// id and pooler tensors and passing everything else through unchanged.
+func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
+	// tensor names that are not written to the output
+	dropped := []string{
+		"embeddings.position_ids",
+		"pooler.dense.weight",
+		"pooler.dense.bias",
+	}
+
+	var out []llm.Tensor
+	for _, t := range ts {
+		name := t.Name()
+		if !slices.Contains(dropped, name) {
+			out = append(out, llm.Tensor{
+				Name:     name,
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		}
+	}
+
+	return out
+}
+
+// Replacements returns the old/new string pairs used to rewrite BERT tensor
+// names, in the form expected by strings.NewReplacer.
+//
+// Order matters: strings.Replacer compares old strings in argument order, so
+// when one key is a prefix of another the longer key must be listed first.
+func (bertModel) Replacements() []string {
+	return []string{
+		// "encoder.layers" must precede "encoder.layer"; otherwise the shorter
+		// key matches first and "encoder.layers.0" becomes "blks.0"
+		"encoder.layers", "blk",
+		"encoder.layer", "blk",
+		"embeddings.word_embeddings", "token_embd",
+		"embeddings.token_type_embeddings", "token_types",
+		"embeddings.LayerNorm", "token_embd_norm",
+		"embeddings.position_embeddings", "position_embd",
+		"attention.self.query", "attn_q",
+		"attention.self.key", "attn_k",
+		"attention.self.value", "attn_v",
+		"attention.output.dense", "attn_output",
+		"attention.output.LayerNorm", "attn_output_norm",
+		"intermediate.dense", "ffn_up",
+		"output.dense", "ffn_down",
+		"output.LayerNorm", "layer_output_norm",
+	}
+}
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@ -9,8 +9,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type gemma struct {
-	Parameters
+type gemmaModel struct {
+	ModelParameters
 	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
 	HiddenSize            uint32  `json:"hidden_size"`
 	HiddenLayers          uint32  `json:"num_hidden_layers"`
@ -21,12 +21,11 @@ type gemma struct {
 	HeadDim               uint32  `json:"head_dim"`
 }

-var _ Converter = (*gemma)(nil)
+var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemma) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
-	kv["general.name"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
 	kv["gemma.embedding_length"] = p.HiddenSize
 	kv["gemma.block_count"] = p.HiddenLayers
@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "_norm.weight") {
+		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *gemma) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *gemmaModel) Replacements() []string {
+	return []string{
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
 		"model.layers", "blk",
@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		"block_sparse_moe.gate", "ffn_inp",
-	).Replace(n)
+	}
 }

-func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, int(shape[0]))

--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@ -0,0 +1,43 @@
+package convert
+
+import (
+	"github.com/ollama/ollama/llm"
+)
+
+// gemma2Model extends gemmaModel with the additional config.json fields that
+// gemma2Model.KV writes for the gemma2 architecture.
+type gemma2Model struct {
+	gemmaModel
+	SlidingWindow         uint32  `json:"sliding_window"`
+	AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
+	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
+}
+
+// KV builds the key-values for a gemma2 model.
+//
+// It starts from ModelParameters.KV rather than gemmaModel.KV, so every
+// architecture specific key is written here under the "gemma2." prefix,
+// together with the fixed tokenizer token ids.
+func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
+
+	for key, value := range map[string]any{
+		"general.architecture":                    "gemma2",
+		"gemma2.context_length":                   p.MaxPositionEmbeddings,
+		"gemma2.embedding_length":                 p.HiddenSize,
+		"gemma2.block_count":                      p.HiddenLayers,
+		"gemma2.feed_forward_length":              p.IntermediateSize,
+		"gemma2.attention.head_count":             p.NumAttentionHeads,
+		"gemma2.attention.head_count_kv":          p.NumKeyValueHeads,
+		"gemma2.attention.layer_norm_rms_epsilon": p.RMSNormEPS,
+		"gemma2.attention.key_length":             p.HeadDim,
+		"gemma2.attention.value_length":           p.HeadDim,
+		"gemma2.attention.sliding_window":         p.SlidingWindow,
+		"gemma2.attn_logit_softcapping":           p.AttentionLogitSoftcap,
+		"gemma2.final_logit_softcapping":          p.FinalLogitSoftcap,
+		"tokenizer.ggml.eot_token_id":             uint32(107),
+		"tokenizer.ggml.middle_token_id":          uint32(68),
+		"tokenizer.ggml.prefix_token_id":          uint32(67),
+		"tokenizer.ggml.suffix_token_id":          uint32(69),
+	} {
+		kv[key] = value
+	}
+
+	return kv
+}
+
+// Replacements returns the tensor name replacements for gemma2: the gemma2
+// specific pairs followed by the ones inherited from gemmaModel.
+//
+// The gemma2 pairs come first on purpose. strings.Replacer compares old
+// strings in argument order, and gemmaModel already maps
+// "post_attention_layernorm" to "ffn_norm"; if that pair were listed first the
+// gemma2 mapping to "post_attention_norm" would never apply and the tensor
+// would collide with "pre_feedforward_layernorm", which also maps to "ffn_norm".
+func (p *gemma2Model) Replacements() []string {
+	return append(
+		[]string{
+			"post_attention_layernorm", "post_attention_norm",
+			"pre_feedforward_layernorm", "ffn_norm",
+			"post_feedforward_layernorm", "post_ffw_norm",
+		},
+		p.gemmaModel.Replacements()...,
+	)
+}
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@ -0,0 +1,91 @@
+package convert
+
+import (
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+// gemma2Adapter converts LoRA adapters for gemma2 base models; it adds no
+// fields beyond the shared AdapterParameters.
+type gemma2Adapter struct {
+	AdapterParameters
+}
+
+var _ AdapterConverter = (*gemma2Adapter)(nil)
+
+// KV returns the shared adapter key-values tagged with the gemma2
+// architecture. baseKV is accepted to satisfy AdapterConverter and is not read.
+func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+	out := p.AdapterParameters.KV()
+	out["general.architecture"] = "gemma2"
+
+	return out
+}
+
+// Tensors maps the parsed adapter tensors to LLM tensors.
+//
+// A lora_a tensor whose first dimension is larger than its second, or a
+// lora_b tensor whose first dimension is smaller than its second, has its
+// first two dimensions swapped and is given the repack function so its data
+// is rewritten to match. All other tensors pass through unchanged.
+func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
+	var out []llm.Tensor
+	for _, t := range ts {
+		shape := t.Shape()
+		// only tensors with at least two dimensions can be swapped; the length
+		// check keeps the index expressions below from panicking
+		if len(shape) >= 2 &&
+			((strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
+				(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1])) {
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repack)
+		}
+
+		out = append(out, llm.Tensor{
+			Name: t.Name(),
+			Kind: t.Kind(),
+			// emit the (possibly swapped) local shape, as llamaAdapter.Tensors
+			// does, instead of re-reading t.Shape() and relying on it to
+			// return the same backing slice
+			Shape:    shape,
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+// Replacements returns the old/new string pairs used to rewrite gemma2 adapter
+// tensor names, in the form expected by strings.NewReplacer.
+func (p *gemma2Adapter) Replacements() []string {
+	// wrapper prefix and layer container
+	pairs := []string{
+		"base_model.model.", "",
+		"model.layers", "blk",
+	}
+
+	// attention and feed forward projections
+	pairs = append(pairs,
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+	)
+
+	// LoRA matrices, in both accepted spellings
+	return append(pairs,
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	)
+}
+
+// repack rewrites the data of a tensor whose dimensions were swapped by
+// Tensors and returns it as a flat slice. name is not used.
+//
+// NOTE(review): the exact layout produced depends on the semantics of
+// T/Reshape/Transpose in github.com/pdevine/tensor; confirm against that
+// package before changing the order of the calls below.
+func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	// shape arrives already swapped, so reversing it presumably recovers the
+	// layout data is actually stored in
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// flatten the selected slices back into one contiguous slice
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@ -3,6 +3,7 @@ package convert
 import (
 	"cmp"
 	"fmt"
+	"math"
 	"strings"

 	"github.com/pdevine/tensor"
@ -11,8 +12,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type llama struct {
-	Parameters
+type llamaModel struct {
+	ModelParameters
 	NLayers               uint32  `json:"n_layers"`
 	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
 	NLayer                uint32  `json:"n_layer"`
@ -28,7 +29,13 @@ type llama struct {
 	RopeTheta             float32 `json:"rope_theta"`
 	RopeScaling           struct {
 		Type                            string  `json:"type"`
+		RopeType                        string  `json:"rope_type"`
 		Factor                          float32 `json:"factor"`
+		LowFrequencyFactor              float32 `json:"low_freq_factor"`
+		HighFrequencyFactor             float32 `json:"high_freq_factor"`
+		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
+
+		factors ropeFactor
 	} `json:"rope_scaling"`
 	RMSNormEPS       float32 `json:"rms_norm_eps"`
 	LayerNormEPS     float32 `json:"layer_norm_eps"`
@ -37,12 +44,11 @@ type llama struct {
 	HeadDim          uint32  `json:"head_dim"`
 }

-var _ Converter = (*llama)(nil)
+var _ ModelConverter = (*llamaModel)(nil)

-func (p *llama) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
-	kv["general.name"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize

 	kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	if p.RopeScaling.Type == "linear" {
 		kv["llama.rope.scaling.type"] = p.RopeScaling.Type
 		kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
+	} else if p.RopeScaling.RopeType == "llama3" {
+		dim := p.HiddenSize / p.NumAttentionHeads
+		for i := uint32(0); i < dim; i += 2 {
+			factor := cmp.Or(p.RopeScaling.Factor, 8.0)
+			factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
+			factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+
+			original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
+			lambdaLow := float32(original) / factorLow
+			lambdaHigh := float32(original) / factorHigh
+
+			lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
+			if lambda < float64(lambdaHigh) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
+			} else if lambda > float64(lambdaLow) {
+				p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
+			} else {
+				smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
+				p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
+			}
+		}
 	}

 	if p.NumKeyValueHeads > 0 {
@ -93,17 +120,26 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 	var out []llm.Tensor
+
+	if p.RopeScaling.factors != nil {
+		out = append(out, llm.Tensor{
+			Name:     "rope_freqs.weight",
+			Kind:     0,
+			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
+			WriterTo: p.RopeScaling.factors,
+		})
+	}
+
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasSuffix(name, "attn_q.weight") ||
-			strings.HasSuffix(name, "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") ||
+			strings.HasSuffix(t.Name(), "attn_k.weight") {
 			t.SetRepacker(p.repack)
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *llama) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *llamaModel) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@ -128,21 +164,19 @@ func (p *llama) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-		// mixtral
-		"block_sparse_moe.gate", "ffn_gate_inp",
-	).Replace(n)
+	}
 }

-func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
 	var dims []int
 	for _, dim := range shape {
 		dims = append(dims, int(dim))
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "q_proj.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "k_proj.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@ -0,0 +1,169 @@
+package convert
+
+import (
+	"cmp"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/llm"
+)
+
+// llamaAdapter holds the settings used to convert a LoRA adapter that
+// targets a llama-architecture base model. The head counts are read by
+// repack and repackAndTranspose.
+type llamaAdapter struct {
+	AdapterParameters
+	// NumAttentionHeads is overwritten by KV with the base model's
+	// "llama.attention.head_count" value.
+	NumAttentionHeads uint32 `json:"num_attention_heads"`
+	// NumKeyValueHeads is preferred when repacking attn_k tensors; when it
+	// is zero, NumAttentionHeads is used instead.
+	NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
+}
+
+// compile-time check that *llamaAdapter satisfies AdapterConverter
+var _ AdapterConverter = (*llamaAdapter)(nil)
+
+// KV returns the adapter's metadata: the generic adapter key/values plus
+// the architecture name and the two attention head counts copied from
+// baseKV.
+//
+// As a side effect it stores the base model's attention head count on p,
+// where repack and repackAndTranspose read it.
+func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+	const (
+		headCountKey   = "llama.attention.head_count"
+		headCountKVKey = "llama.attention.head_count_kv"
+	)
+
+	out := p.AdapterParameters.KV()
+	out["general.architecture"] = "llama"
+	for _, key := range []string{headCountKey, headCountKVKey} {
+		out[key] = baseKV[key]
+	}
+
+	// NOTE(review): this assertion panics when baseKV carries no uint32 head
+	// count, and NumKeyValueHeads is not refreshed from baseKV here — confirm
+	// both are intended.
+	p.NumAttentionHeads = baseKV[headCountKey].(uint32)
+
+	return out
+}
+
+// Tensors converts the parsed adapter tensors into llm.Tensor entries and
+// installs a repacker on each one.
+//
+// A "weight.lora_a" tensor whose first dimension is larger than its second,
+// or a "weight.lora_b" tensor whose first dimension is smaller, has its two
+// dimensions swapped and is routed through repackAndTranspose; every other
+// tensor keeps its shape and uses repack.
+func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
+	var converted []llm.Tensor
+	for _, t := range ts {
+		name, shape := t.Name(), t.Shape()
+
+		isLoraA := strings.HasSuffix(name, "weight.lora_a")
+		isLoraB := strings.HasSuffix(name, "weight.lora_b")
+
+		switch {
+		case isLoraA && shape[0] > shape[1], isLoraB && shape[0] < shape[1]:
+			// The swap writes through the slice returned by Shape(); if that
+			// is the tensor's own slice the tensor sees the swapped shape
+			// too — TODO confirm against the Tensor implementation.
+			shape[0], shape[1] = shape[1], shape[0]
+			t.SetRepacker(p.repackAndTranspose)
+		default:
+			t.SetRepacker(p.repack)
+		}
+
+		converted = append(converted, llm.Tensor{
+			Name:     name,
+			Kind:     t.Kind(),
+			Shape:    shape,
+			WriterTo: t,
+		})
+	}
+
+	return converted
+}
+
+// Replacements returns alternating old/new substrings that rename adapter
+// tensors from the source checkpoint to the names used in the converted
+// output (for example "self_attn.q_proj" becomes "attn_q").
+//
+// NOTE(review): the consumer is not in this file; presumably the pairs are
+// handed to strings.NewReplacer, in which case their order is significant —
+// confirm before reordering.
+func (p *llamaAdapter) Replacements() []string {
+	return []string{
+		// drop the wrapper prefix from tensor names
+		"base_model.model.", "",
+		"model.layers", "blk",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		// both suffix spellings end up as "weight.lora_a" / "weight.lora_b"
+		"lora_A.weight", "weight.lora_a",
+		"lora_B.weight", "weight.lora_b",
+		"lora_a", "weight.lora_a",
+		"lora_b", "weight.lora_b",
+	}
+}
+
+// repack is the repacker that Tensors installs on tensors which do not need
+// the extra transpose. Tensors whose name does not end in
+// "attn_q.weight.lora_a" or "attn_k.weight.lora_a" are returned unchanged;
+// for those two, the values are regrouped per attention head and returned
+// as a new flat slice.
+//
+// shape must have at least two entries. The head count must be non-zero for
+// the two handled suffixes, because it is used as a divisor below.
+func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	// dims is shape with its two axes swapped
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		// fall back to the attention head count when no KV head count is set
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	} else {
+		// every other tensor passes through untouched
+		return data, nil
+	}
+
+	// view data as a tensor of shape dims
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	// split the first axis into (heads, 2, dims[0]/heads/2), keeping the rest
+	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+		return nil, err
+	}
+
+	// request an axis permutation that exchanges axes 1 and 2
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	// collapse back to the two-dimensional dims shape
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	// NOTE(review): presumably applies any permutation still pending from T
+	// above — confirm against the tensor package documentation
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	// read the result back as slices of float32 ...
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// ... and concatenate them into one flat slice
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
+
+// repackAndTranspose is the repacker that Tensors installs on tensors whose
+// two dimensions it swapped. It optionally regroups the values per attention
+// head (only for names ending in "attn_q.weight.lora_a" or
+// "attn_k.weight.lora_a") and then always applies a two-axis transpose
+// before returning the values as a new flat slice.
+//
+// shape must have at least two entries.
+func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
+	// dims is shape with its two axes swapped
+	dims := []int{int(shape[1]), int(shape[0])}
+
+	// view data as a tensor of shape dims
+	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+
+	// heads stays zero for every name other than the two lora_a suffixes
+	// below, which skips the per-head regrouping entirely
+	var heads uint32
+	if strings.HasSuffix(name, "attn_q.weight.lora_a") {
+		heads = p.NumAttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
+		// fall back to the attention head count when no KV head count is set
+		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
+	}
+
+	if heads > 0 {
+		// split the first axis into (heads, 2, dims[0]/heads/2), keeping the rest
+		if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+			return nil, err
+		}
+
+		// request an axis permutation that exchanges axes 1 and 2
+		if err := n.T(0, 2, 1, 3); err != nil {
+			return nil, err
+		}
+
+		// collapse back to the two-dimensional dims shape
+		if err := n.Reshape(dims...); err != nil {
+			return nil, err
+		}
+
+		// NOTE(review): presumably applies any permutation still pending
+		// from T above — confirm against the tensor package documentation
+		if err := n.Transpose(); err != nil {
+			return nil, err
+		}
+	}
+
+	// the unconditional part: exchange the two axes ...
+	if err := n.T(1, 0); err != nil {
+		return nil, err
+	}
+
+	// ... then reshape back to dims and run Transpose once more
+	if err := n.Reshape(dims...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+
+	// read the result back as slices of float32 ...
+	ts, err := native.SelectF32(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	// ... and concatenate them into one flat slice
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
+	}
+
+	return f32s, nil
+}
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@ -9,16 +9,14 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type mixtral struct {
-	llama
+type mixtralModel struct {
+	llamaModel
 	NumLocalExperts    uint32 `json:"num_local_experts"`
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }

-var _ Converter = (*mixtral)(nil)
-
-func (p *mixtral) KV(t *Tokenizer) llm.KV {
-	kv := p.llama.KV(t)
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+	kv := p.llamaModel.KV(t)

 	if p.NumLocalExperts > 0 {
 		kv["llama.expert_count"] = p.NumLocalExperts
@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
 		})
 	}

-	return append(out, p.llama.Tensors(ts)...)
+	return append(out, p.llamaModel.Tensors(ts)...)
+}
+
+// Replacements returns the embedded llamaModel's replacement pairs followed
+// by the mixtral-only "block_sparse_moe.gate" -> "ffn_gate_inp" pair.
+func (p *mixtralModel) Replacements() []string {
+	pairs := p.llamaModel.Replacements()
+	return append(pairs, "block_sparse_moe.gate", "ffn_gate_inp")
 }

 type experts []Tensor
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@ -11,8 +11,8 @@ import (
 	"github.com/ollama/ollama/llm"
 )

-type phi3 struct {
-	Parameters
+type phi3Model struct {
+	ModelParameters
 	NumHiddenLayers   uint32  `json:"num_hidden_layers"`
 	NLayers           uint32  `json:"n_layers"`
 	HiddenSize        uint32  `json:"hidden_size"`
@ -35,12 +35,11 @@ type phi3 struct {
 	SlidingWindow                 uint32  `json:"sliding_window"`
 }

-var _ Converter = (*phi3)(nil)
+var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3) KV(t *Tokenizer) llm.KV {
-	kv := p.Parameters.KV(t)
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
-	kv["general.name"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
 	kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
 	kv["phi3.feed_forward_length"] = p.IntermediateSize
@ -69,13 +68,12 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
 	return kv
 }

-func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 	var addRopeFactors sync.Once

 	out := make([]llm.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
-		name := p.tensorName(t.Name())
-		if strings.HasPrefix(name, "blk.0.") {
+		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
 				out = append(out, llm.Tensor{
 					Name:     "rope_factors_long.weight",
@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 		}

 		out = append(out, llm.Tensor{
-			Name:     name,
+			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),
 			WriterTo: t,
@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
 	return out
 }

-func (p *phi3) tensorName(n string) string {
-	return strings.NewReplacer(
+func (p *phi3Model) Replacements() []string {
+	return []string{
 		"lm_head", "output",
 		"model.embed_tokens", "token_embd",
 		"model.norm", "output_norm",
@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_up_proj", "ffn_up",
 		"post_attention_layernorm", "ffn_norm",
-	).Replace(n)
+	}
 }

 type ropeFactor []float32
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@ -1,7 +1,9 @@
 package convert

 import (
+	"bytes"
 	"crypto/sha256"
+	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
 	"flag"
@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	}
 	defer f.Close()

-	if err := Convert(fsys, f); err != nil {
+	if err := ConvertModel(fsys, f); err != nil {
 		t.Fatal(err)
 	}

@ -51,37 +53,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func TestMain(m *testing.M) {
-	var level slog.Level
-	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
-	flag.Parse()
-	slog.SetLogLoggerLevel(level)
-	os.Exit(m.Run())
-}
-
-func TestConvertFull(t *testing.T) {
-	cases := []string{
-		"Meta-Llama-3-8B-Instruct",
-		"Mistral-7B-Instruct-v0.2",
-		"Mixtral-8x7B-Instruct-v0.1",
-		"gemma-2b-it",
-		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
-		"Phi-3-mini-128k-instruct",
-	}
-
-	for i := range cases {
-		tt := cases[i]
-		t.Run(tt, func(t *testing.T) {
-			t.Parallel()
-
-			p := filepath.Join("testdata", tt)
-			if testing.Short() {
-				t.Skip("skipping in short mode")
-			} else if _, err := os.Stat(p); err != nil {
-				t.Skipf("%s not found", p)
-			}
-
-			f, kv, tensors := convertFull(t, os.DirFS(p))
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@ -106,6 +78,45 @@ func TestConvertFull(t *testing.T) {
 		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
 	}

+	return actual
+}
+
+// TestMain registers a -level flag (default slog.LevelInfo), applies the
+// parsed value as the slog log level, runs the package's tests and exits
+// with their status code.
+func TestMain(m *testing.M) {
+	var logLevel slog.Level
+	flag.TextVar(&logLevel, "level", slog.LevelInfo, "log level")
+	flag.Parse()
+	slog.SetLogLoggerLevel(logLevel)
+
+	code := m.Run()
+	os.Exit(code)
+}
+
+func TestConvertFull(t *testing.T) {
+	cases := []string{
+		"Meta-Llama-3-8B-Instruct",
+		"Meta-Llama-3.1-8B-Instruct",
+		"Mistral-7B-Instruct-v0.2",
+		"Mixtral-8x7B-Instruct-v0.1",
+		"gemma-2b-it",
+		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
+		"Phi-3-mini-128k-instruct",
+		"all-MiniLM-L6-v2",
+		"gemma-2-9b-it",
+	}
+
+	for i := range cases {
+		tt := cases[i]
+		t.Run(tt, func(t *testing.T) {
+			t.Parallel()
+
+			p := filepath.Join("testdata", tt)
+			if testing.Short() {
+				t.Skip("skipping in short mode")
+			} else if _, err := os.Stat(p); err != nil {
+				t.Skipf("%s not found", p)
+			}
+
+			f, kv, tensors := convertFull(t, os.DirFS(p))
+			actual := generateResultsJSON(t, f, kv, tensors)
+
 			expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 			if err != nil {
 				t.Fatal(err)
@ -128,3 +139,209 @@ func TestConvertFull(t *testing.T) {
 		})
 	}
 }
+
+// TestConvertAdapter converts a generated LoRA adapter with ConvertAdapter,
+// decodes the written file again and compares selected KV values and tensor
+// checksums against the expectations of each case.
+func TestConvertAdapter(t *testing.T) {
+	// AdapterCase describes one conversion: the base model KV handed to
+	// ConvertAdapter and the entries expected in the results map built by
+	// generateResultsJSON.
+	type AdapterCase struct {
+		Name     string
+		BaseKV   map[string]any
+		Expected map[string]string
+	}
+
+	cases := []AdapterCase{
+		{
+			Name: "discollama",
+			BaseKV: map[string]any{
+				"general.architecture":          "llama",
+				"llama.attention.head_count":    uint32(32),
+				"llama.attention.head_count_kv": uint32(8),
+			},
+			Expected: map[string]string{
+				"general.architecture":          "llama",
+				"general.file_type":             "1",
+				"general.parameter_count":       "106496",
+				"general.type":                  "adapter",
+				"general.version":               "v0.2",
+				"adapter.lora.alpha":            "16",
+				"adapter.type":                  "lora",
+				"llama.attention.head_count":    "32",
+				"llama.attention.head_count_kv": "8",
+				// tensor entries are hex-encoded SHA-256 checksums
+				"blk.31.attn_q.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_q.weight.lora_b":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_a":   "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
+				"blk.31.attn_v.weight.lora_b":   "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
+			},
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.Name, func(t *testing.T) {
+			t.Parallel()
+
+			// output file the converted adapter is written to
+			f, err := os.CreateTemp(t.TempDir(), "f16")
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer f.Close()
+
+			// input directory holding the generated adapter files
+			tempDir := t.TempDir()
+			generateLoraTestData(t, tempDir)
+
+			if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
+				t.Fatal(err)
+			}
+
+			// reopen the output by name to read it from the start
+			r, err := os.Open(f.Name())
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+
+			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// rewind before generateResultsJSON reads from r again
+			if _, err := r.Seek(0, io.SeekStart); err != nil {
+				t.Fatal(err)
+			}
+
+			actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
+
+			// only keys listed in Expected are checked, in sorted order;
+			// additional entries in actual are ignored
+			keys := maps.Keys(c.Expected)
+			slices.Sort(keys)
+			for _, k := range keys {
+				if v, ok := actual[k]; !ok {
+					t.Errorf("missing %s", k)
+				} else if v != c.Expected[k] {
+					t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
+				}
+			}
+		})
+	}
+}
+
+// generateLoraTestData writes a small synthetic LoRA adapter into tempDir:
+// an "adapters.safetensors" file holding four F32 tensors filled with ones,
+// and an "adapter_config.json" file with fixed training settings. Any
+// failure aborts the test through t.Fatal.
+func generateLoraTestData(t *testing.T, tempDir string) {
+	// tensorData is one entry of the JSON header at the start of the file.
+	type tensorData struct {
+		Offsets []int  `json:"data_offsets"`
+		Type    string `json:"dtype"`
+		Shape   []int  `json:"shape"`
+	}
+	// size in bytes of one 4096x8 tensor of 4-byte values
+	offset := 4096 * 8 * 4
+
+	// "__metadata__" is present in the header with a null value
+	td := map[string]*tensorData{"__metadata__": nil}
+	td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
+		Offsets: []int{0, offset},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset, offset * 2},
+		Type:    "F32",
+		Shape:   []int{8, 4096},
+	}
+	td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
+		Offsets: []int{offset * 2, offset * 3},
+		Type:    "F32",
+		Shape:   []int{4096, 8},
+	}
+	// the last tensor is smaller (8x1024), so its end offset is computed
+	// separately
+	td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
+		Offsets: []int{offset * 3, offset*3 + 8*1024*4},
+		Type:    "F32",
+		Shape:   []int{8, 1024},
+	}
+
+	data, err := json.Marshal(td)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var buf bytes.Buffer
+
+	// the file starts with the header length as a little-endian int64,
+	// followed by the JSON header itself
+	l := int64(len(data))
+	err = binary.Write(&buf, binary.LittleEndian, l)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = buf.Write(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// write some data for the tensors
+
+	// three tensors of 4096*8 ones ...
+	ones := make([]float32, 4096*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	for range 3 {
+		err = binary.Write(&buf, binary.LittleEndian, ones)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// ... followed by one tensor of 1024*8 ones
+	ones = make([]float32, 1024*8)
+	for i := range ones {
+		ones[i] = float32(1)
+	}
+
+	err = binary.Write(&buf, binary.LittleEndian, ones)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer fdata.Close()
+
+	_, err = fdata.Write(buf.Bytes())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// fixed adapter configuration written next to the tensor file
+	configData := `
+{
+    "adapter_path": "adapters-test",
+    "batch_size": 8,
+    "config": "config-tiny.json",
+    "data": "../discollama-completion",
+    "grad_checkpoint": null,
+    "iters": 1000,
+    "learning_rate": 1e-05,
+    "lora_layers": 1,
+    "lora_parameters": {
+        "rank": 8,
+        "alpha": 16,
+        "dropout": 0.0,
+        "scale": 2.0
+    },
+    "lr_schedule": null,
+    "max_seq_length": 2048,
+    "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
+    "resume_adapter_file": null,
+    "save_every": 100,
+    "seed": 0,
+    "steps_per_eval": 200,
+    "steps_per_report": 10,
+    "test": false,
+    "test_batches": 500,
+    "train": true,
+    "use_dora": false,
+    "val_batches": 25
+}
+`
+	f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	_, err = f.WriteString(configData)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
--- a/convert/reader.go
+++ b/convert/reader.go
@ -35,7 +35,9 @@ const (
 )

 func (t tensorBase) Kind() uint32 {
-	if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
+	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
+		t.name == "token_types.weight" {
+		// these tensors are always F32
 		return 0
 	}

@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {

 type repacker func(string, []float32, []uint64) ([]float32, error)

-func parseTensors(fsys fs.FS) ([]Tensor, error) {
+func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
-		Func    func(fs.FS, ...string) ([]Tensor, error)
+		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
+		{"adapters.safetensors", parseSafetensors},
+		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
 		}

 		if len(matches) > 0 {
-			return pattern.Func(fsys, matches...)
+			return pattern.Func(fsys, replacer, matches...)
 		}
 	}

--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@ -8,6 +8,7 @@ import (
 	"io"
 	"io/fs"
 	"slices"
+	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
@ -20,7 +21,7 @@ type safetensorMetadata struct {
 	Offsets []int64  `json:"data_offsets"`
 }

-func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		f, err := fsys.Open(p)
@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 					offset: safetensorsPad(n, value.Offsets[0]),
 					size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 					tensorBase: &tensorBase{
-						name:  key,
+						name:  replacer.Replace(key),
 						shape: value.Shape,
 					},
 				})
--- a/convert/reader_torch.go
+++ b/convert/reader_torch.go
@ -3,12 +3,13 @@ package convert
 import (
 	"io"
 	"io/fs"
+	"strings"

 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 )

-func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		pt, err := pytorch.Load(p)
@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
 			ts = append(ts, torch{
 				storage: t.(*pytorch.Tensor).Source,
 				tensorBase: &tensorBase{
-					name:  k.(string),
+					name:  replacer.Replace(k.(string)),
 					shape: shape,
 				},
 			})
--- a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
+++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
@ -0,0 +1,3 @@
+{
+  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
+}
--- a/convert/testdata/all-MiniLM-L6-v2.json
+++ b/convert/testdata/all-MiniLM-L6-v2.json
@ -0,0 +1,124 @@
+{
+  "general.architecture": "bert",
+  "general.file_type": "1",
+  "general.quantization_version": "2",
+  "bert.attention.causal": "false",
+  "bert.attention.head_count": "12",
+  "bert.attention.layer_norm_epsilon": "1e-12",
+  "bert.block_count": "6",
+  "bert.context_length": "512",
+  "bert.embedding_length": "384",
+  "bert.feed_forward_length": "1536",
+  "bert.pooling_type": "1",
+  "tokenizer.ggml.model": "bert",
+  "tokenizer.ggml.padding_token_id": "0",
+  "tokenizer.ggml.unknown_token_id": "100",
+  "tokenizer.ggml.cls_token_id": "101",
+  "tokenizer.ggml.seperator_token_id": "102",
+  "tokenizer.ggml.mask_token_id": "103",
+  "tokenizer.ggml.token_type_count": "2",
+  "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
+  "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
+  "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
+  "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
+  "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
+  "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
+  "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
+  "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
+  "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
+  "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
+  "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
+  "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
+  "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
+  "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
+  "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
+  "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
+  "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
+  "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
+  "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
+  "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
+  "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
+  "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
+  "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
+  "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
+  "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
+  "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
+  "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
+  "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
+  "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
+  "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
+  "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
+  "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
+  "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
+  "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
+  "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
+  "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
+  "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
+  "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
+  "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
+  "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
+  "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
+  "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
+  "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
+  "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
+  "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
+  "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
+  "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
+  "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
+  "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
+  "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
+  "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
+  "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
+  "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
+  "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
+  "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
+  "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
+  "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
+  "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
+  "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
+  "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
+  "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
+  "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
+  "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
+  "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
+  "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
+  "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
+  "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
+  "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
+  "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
+  "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
+  "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
+  "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
+  "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
+  "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
+  "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
+  "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
+  "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
+  "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
+  "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
+  "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
+  "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
+  "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
+  "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
+  "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
+  "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
+  "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
+  "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
+  "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
+  "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
+  "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
+  "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
+  "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
+  "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
+  "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
+  "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
+  "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
+  "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
+  "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
+  "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
+  "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
+  "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
+  "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
+  "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
+  "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
+}
--- a/convert/testdata/gemma-2-9b-it.json
+++ b/convert/testdata/gemma-2-9b-it.json
@ -0,0 +1,6 @@
+{
+  "general.architecture": "gemma2",
+  "gemma2.attention.sliding_window": "4096",
+  "gemma2.attn_logit_softcapping": "50",
+  "gemma2.final_logit_softcapping": "30"
+}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@ -1,7 +1,6 @@
 package convert

 import (
-	"cmp"
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
@ -11,6 +10,8 @@ import (
 	"log/slog"
 	"os"
 	"slices"
+
+	"golang.org/x/exp/maps"
 )

 const (
@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		return nil, err
 	}

-	var tokens []token
+	tokens := make(map[int]token, len(t.Model.Vocab))
 	for k, v := range t.Model.Vocab {
-		tokens = append(tokens, token{
+		tokens[v] = token{
 			ID:      v,
 			Content: k,
-		})
+		}
 	}

-	for _, t := range t.AddedTokens {
-		t.UserDefined = true
-		tokens = append(tokens, t)
+	for _, token := range t.AddedTokens {
+		token.UserDefined = true
+		tokens[token.ID] = token
 	}

-	slices.SortFunc(tokens, func(i, j token) int {
-		return cmp.Compare(i.ID, j.ID)
-	})
+	keys := maps.Keys(tokens)
+	slices.Sort(keys)

 	v := Vocabulary{Model: "gpt2"}
-	for _, t := range tokens {
-		v.Tokens = append(v.Tokens, t.Content)
-		v.Scores = append(v.Scores, float32(t.ID))
+	for _, k := range keys {
+		token := tokens[k]
+		v.Tokens = append(v.Tokens, token.Content)
+		v.Scores = append(v.Scores, float32(token.ID))

 		switch {
-		case t.Special:
+		case token.Special:
 			v.Types = append(v.Types, tokenTypeControl)
-		case t.UserDefined:
+		case token.UserDefined:
 			v.Types = append(v.Types, tokenTypeUserDefined)
 		default:
 			v.Types = append(v.Types, tokenTypeNormal)
--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@ -15,6 +15,11 @@ import (
 )

 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
+	ast, err := parseAdditionalSpecialTokens(fsys)
+	if err != nil {
+		return nil, err
+	}
+
 	bts, err := fs.ReadFile(fsys, "tokenizer.model")
 	if err != nil {
 		return nil, err
@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			sentencepiece.ModelProto_SentencePiece_BYTE:
 			v.Types = append(v.Types, int32(t))
 		default:
-			v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
+			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
+			if slices.Contains(ast, piece.GetPiece()) {
+				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
+			}
+
+			v.Types = append(v.Types, tt)
 		}
 	}

@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {

 	return &v, nil
 }
+
+// parseAdditionalSpecialTokens reads special_tokens_map.json from fsys and
+// returns the strings listed under its "additional_special_tokens" key.
+//
+// A missing file is not an error and yields (nil, nil). Any other failure to
+// open the file, or a JSON decoding failure, is returned to the caller.
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
+	f, err := fsys.Open("special_tokens_map.json")
+	// The map file is optional: its absence simply means no additional tokens.
+	if errors.Is(err, os.ErrNotExist) {
+		return nil, nil
+	} else if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	// Only the one field of interest is decoded; other keys are ignored.
+	var m struct {
+		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
+	}
+
+	if err := json.NewDecoder(f).Decode(&m); err != nil {
+		return nil, err
+	}
+
+	return m.AdditionalSpecialTokens, nil
+}
--- a/docs/faq.md
+++ b/docs/faq.md
@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.

 ### How do I use Ollama behind a proxy in Docker?

--- a/docs/linux.md
+++ b/docs/linux.md
@ -20,13 +20,12 @@ GPU.

 ## Manual install

-### Download the `ollama` binary
+### Download `ollama`

-Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
+Download and extract the Linux package:

 ```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)
@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by downloading the ollama binary:

 ```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ## Installing specific versions
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -174,7 +174,7 @@ func RunnersDir() (p string) {

 	defer func() {
 		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
 		}
 	}()

@ -190,17 +190,17 @@ func RunnersDir() (p string) {
 	}

 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}

 	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
 		if _, err := os.Stat(candidate); err == nil {
 			p = candidate
 			break
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
--- a/gpu/cuda_common.go
+++ b/gpu/cuda_common.go
@ -4,9 +4,17 @@ package gpu

 import (
 	"log/slog"
+	"os"
+	"regexp"
+	"runtime"
+	"strconv"
 	"strings"
 )

+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
+
+// cudaVariant returns the variant name to use for the given CUDA GPU.
+//
+// On linux/arm64 it first tries to identify a Jetson platform:
+//   - if JETSON_JETPACK (CudaTegra) is set, the text before its first "."
+//     is appended to "jetpack" (e.g. "6.0.1" -> "jetpack6");
+//   - otherwise, if /etc/nv_tegra_release is readable, L4T release 35 maps
+//     to "jetpack5" and release 36 to "jetpack6".
+//
+// In every other case (including an unrecognized or unparsable Tegra release
+// file) the result is "v11" when the GPU's compute major is below 6 or the
+// driver major is below 12, and "v12" otherwise.
+func cudaVariant(gpuInfo CudaGPUInfo) string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			// The explicit environment override takes precedence over the release file.
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			// Capture the digits of the first " R<number> " token in the file.
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+
+	// Generic selection, also reached when no Jetson variant was determined above.
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
+		return "v11"
+	}
+	return "v12"
+}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -64,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@ -215,7 +211,7 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo: mem,
 					Library: "cpu",
-					Variant: cpuCapability,
+					Variant: cpuCapability.String(),
 					ID:      "0",
 				},
 			},
@ -229,11 +225,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}

-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := LibraryDir()

 		// Load ALL libraries
 		cHandles = initCudaHandles()
@ -269,11 +261,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DependencyPath = depPath
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				variant := cudaVariant(gpuInfo)
+				if depPath != "" {
+					gpuInfo.DependencyPath = depPath
+					// Check for variant specific directory
+					if variant != "" {
+						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
+							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						}
+					}
+				}
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
@ -306,13 +310,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@ -467,10 +464,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@ -479,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@ -641,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+// LibraryDir returns the first existing "lib/ollama" directory found while
+// scanning three roots in order: the executable's directory, that directory's
+// parent, and the current working directory. Under each root it checks
+// <root>/lib/ollama, then <root>/<GOOS>-<GOARCH>/lib/ollama, then
+// <root>/dist/<GOOS>-<GOARCH>/lib/ollama.
+//
+// It logs a warning and returns "" when none of the candidates exists.
+func LibraryDir() string {
+	// Bundled dependencies are looked up in a lib/ollama directory relative to
+	// the executable (or the working directory). Lookup failures here are only
+	// logged; the scan below still runs with whatever paths were obtained.
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependencies, and pick the first match
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
+		libDep := filepath.Join("lib", "ollama")
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability(),
+				Variant: GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )

 func GetCPUMem() (memInfo, error) {
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
 	}
 }

+// TestByLibrary checks how many groups GpuInfoList.ByLibrary produces for
+// different combinations of Library and Variant values. Only the number of
+// returned groups is asserted, not their contents.
+func TestByLibrary(t *testing.T) {
+	type testCase struct {
+		input  []GpuInfo
+		expect int // expected number of groups returned by ByLibrary
+	}
+
+	testCases := map[string]*testCase{
+		"empty":                    {input: []GpuInfo{}, expect: 0},
+		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
+		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
+		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
+	}
+
+	// Each named case runs as its own subtest so failures identify the scenario.
+	for k, v := range testCases {
+		t.Run(k, func(t *testing.T) {
+			resp := (GpuInfoList)(v.input).ByLibrary()
+			if len(resp) != v.expect {
+				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
+			}
+		})
+	}
+}
+
 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@ -19,7 +19,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant CPUCapability `json:"variant"`
+	Variant string `json:"variant"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@ -55,6 +55,8 @@ type CudaGPUInfo struct {
 	GpuInfo
 	OSOverhead   uint64 // Memory overhead between the driver library and management library
 	index        int    //nolint:unused,nolintlint
+	computeMajor int    //nolint:unused,nolintlint
+	computeMinor int    //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone {
-			requested += "_" + info.Variant.String()
+		if info.Variant != CPUCapabilityNone.String() {
+			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
@ -92,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 			}
 		}
 		if !found {
-			libs = append(libs, info.Library)
+			libs = append(libs, requested)
 			resp = append(resp, []GpuInfo{info})
 		}
 	}
@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() {
 		slog.Info("inference compute",
 			"id", g.ID,
 			"library", g.Library,
+			"variant", g.Variant,
 			"compute", g.Compute,
 			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
 			"name", g.Name,
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}

-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}

-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@ -1429,7 +1429,13 @@ struct llama_server_context
        switch (task.type)
        {
            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -9,11 +9,14 @@ init_vars() {
        ARCH="arm64"
        ;;
    *)
-        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
    esac

    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@ -27,6 +30,7 @@ init_vars() {
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
        ;;
    "Linux")
        LIB_EXT="so"
@ -35,6 +39,7 @@ init_vars() {

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
        ;;
    *)
        ;;
@ -42,6 +47,7 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }

 git_module_setup() {
@ -85,26 +91,36 @@ build() {

 compress() {
    echo "Compressing payloads to reduce overall binary size..."
-    pids=""
    rm -rf ${BUILD_DIR}/bin/*.gz
    for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        ${GZIP} -n --best -f ${f} &
+        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            ${GZIP} -n --best -f ${f} &
+            compress_pids+=" $!"
        done
    fi
    echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
        wait $pid
    done
    echo "Finished compression"
 }

+# Copy every *.${LIB_EXT} library produced under ${BUILD_DIR} into
+# ${BUILD_DIR}/bin/, replacing any stale copy of the same name.
+# Reads the BUILD_DIR and LIB_EXT variables set by the calling script.
+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    # Skip files that already live in bin/: for those, the rm below would
+    # delete the very file cp is about to read, and the failed cp would abort
+    # the generate scripts, which run with `set -e`.
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} ! -path "${BUILD_DIR}/bin/*"); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
    (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -6,6 +6,7 @@

 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@ -98,4 +99,5 @@ case "${GOARCH}" in
 esac

 cleanup
+wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -13,6 +13,7 @@

 set -ex
 set -o pipefail
+compress_pids=""

 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@ -51,7 +52,7 @@ if [ -z "${CUDACXX}" ]; then
        export CUDACXX=$(command -v nvcc)
    fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@ -77,10 +78,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
+        install
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@ -93,7 +95,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@ -103,6 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
+            install
            compress
        fi

@ -120,6 +123,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                echo "Building AVX CPU"
                build
+                install
                compress
            fi

@ -133,6 +137,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                echo "Building AVX2 CPU"
                build
+                install
                compress
            fi
        fi
@ -160,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" ]; then
+    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [ "${ARCH}" == "arm64" ]; then
@ -178,29 +183,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
-
-    # Carry the CUDA libs as payloads to help reduce dependency burden on users
-    #
-    # TODO - in the future we may shift to packaging these separately and conditionally
-    #        downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
-    for lib in libcudart.so libcublas.so libcublasLt.so ; do
-        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
-        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
-        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
-        fi
+    install
+    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
+    mkdir -p "${CUDA_DIST_DIR}"
+    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
+        cp -a "${lib}" "${CUDA_DIST_DIR}"
    done
    compress

@ -218,21 +213,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
+    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build

    # copy oneAPI dependencies
+    mkdir -p "${ONEAPI_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp "${dep}" "${BUILD_DIR}/bin/"
+        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
+    install
    compress
 fi

@ -254,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@ -262,23 +260,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        echo "Building custom ROCM GPU"
    fi
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    # ROCm dependencies are too large to fit into a unified bundle
+    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
+    # TODO figure out how to disable runpath (rpath)
+    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
+    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

-    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
+    # copy the ROCM dependencies
+    mkdir -p "${ROCM_DIST_DIR}"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+        cp -a "${dep}"* "${ROCM_DIST_DIR}"
    done
-    # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/bin/deps.txt"
-        echo "ERROR: deps file short"
-        exit 1
-    fi
+    install
    compress
 fi

 cleanup
+wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -35,7 +35,7 @@ function init_vars {
        )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@ -117,7 +117,7 @@ function build {
    if ($cmakeDefs -contains "-G") {
        $extra=@("-j8")
    } else {
-        $extra= @("--", "/p:CL_MPcount=8")
+        $extra= @("--", "/maxCpuCount:8")
    }
    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@ -261,7 +261,7 @@ function build_cuda() {
    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
        # Then build cuda as a dynamically loaded library
        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
        if ($null -ne $script:CUDA_VERSION) {
            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
        }
@ -273,9 +273,9 @@ function build_cuda() {
            "-DGGML_CUDA=ON",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
-            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+            "-DCMAKE_CUDA_FLAGS=-t6",
+            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
+            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
            )
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@ -286,12 +286,11 @@ function build_cuda() {
        sign
        install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    } else {
        write-host "Skipping CUDA generation step"
    }
@ -325,18 +324,17 @@ function build_oneapi() {
    sign
    install

-    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
  } else {
    Write-Host "Skipping oneAPI generation step"
  }
@ -357,7 +355,7 @@ function build_rocm() {
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
@ -386,12 +384,11 @@ function build_rocm() {
        sign
        install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
+        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
+        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
    } else {
        write-host "Skipping ROCm generation step"
    }
--- a/llm/ggml.go
+++ b/llm/ggml.go
@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
 	return "unknown"
 }

+// Kind returns the value stored under the "general.type" key when that value
+// is a string, and "unknown" when the key is absent or holds another type.
+func (kv KV) Kind() string {
+	if s, ok := kv["general.type"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = WriteGGUF(f, KV{
 		"general.architecture":          "llama",
-		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(inputLayerCount),
--- a/llm/patches/08-pooling.diff
+++ b/llm/patches/08-pooling.diff
@ -1,60 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8420,14 +8420,14 @@ struct llm_build_context {
-     }
- 
-     struct ggml_tensor * build_inp_mean() {
-        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
-+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
-         cb(lctx.inp_mean, "inp_mean", -1);
-         ggml_set_input(lctx.inp_mean);
-         return lctx.inp_mean;
-     }
- 
-     struct ggml_tensor * build_inp_cls() {
-        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
-         cb(lctx.inp_cls, "inp_cls", -1);
-         ggml_set_input(lctx.inp_cls);
-         return lctx.inp_cls;
-@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
- 
-         float * data = (float *) lctx.inp_mean->data;
-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
-+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
- 
-         std::vector<uint64_t> sum(n_tokens, 0);
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
-             sum[seq_id] += 1;
-         }
- 
-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
-+        std::vector<float> div(cparams.n_seq_max, 0.0f);
-+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
-             const uint64_t s = sum[i];
-             if (s > 0) {
-                 div[i] = 1.0f/float(s);
-@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
- 
-         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
-+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
- 
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-             const llama_pos    pos    = batch.pos[i];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
-             if (pos == 0) {
-                 data[seq_id] = i;
-             }
--- a/llm/payload.go
+++ b/llm/payload.go
@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := getAvailableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone {
-		requested += "_" + info.Variant.String()
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
 	}

 	servers := []string{}
--- a/llm/server.go
+++ b/llm/server.go
@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() {
+	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
-		libraryPaths := []string{dir, filepath.Dir(dir)}
+		// Start with the server directory for the LD_LIBRARY_PATH/PATH
+		libraryPaths := []string{dir}

 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-			// Append our runner directory to the path
-			// This will favor system libraries over our bundled library dependencies
+			// favor our bundled library dependencies over system libraries
 			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 		}

 		// Note: we always put the dependency path first
-		// since this was the exact version we verified for AMD GPUs
-		// and we favor what the user had in their path
+		// since this was the exact version we compiled/linked against
 		if gpus[0].DependencyPath != "" {
-			// TODO refine for multi-gpu support
+			// assume gpus from the same library have the same dependency path
 			libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 		}

--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@ -4,6 +4,7 @@ set -eu

 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")

 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@ -21,11 +22,16 @@ for TARGETARCH in ${BUILD_ARCH}; do
        -t builder:$TARGETARCH \
        .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-
-    if [ "$TARGETARCH" = "amd64" ]; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
+    rm -rf ./dist/linux-$TARGETARCH
+    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
+    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
    fi
-
    docker rm builder-$TARGETARCH
+    echo "Compressing final linux bundle..."
+    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
+    if [ -d dist/linux-$TARGETARCH-rocm ]; then
+        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
+    fi
 done
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@ -7,6 +7,7 @@
 $ErrorActionPreference = "Stop"

 function checkEnv() {
+    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
    Write-host "Building for ${script:TARGET_ARCH}"
    write-host "Locating required tools and paths"
@ -15,26 +16,23 @@ function checkEnv() {
        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
        $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
    }
-    # Try to find the CUDA dir
-    if ($null -eq $env:NVIDIA_DIR) {
-        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($d -ne $null) {
-            $script:NVIDIA_DIR=($d| split-path -parent)
-        } else {
+    # Locate CUDA versions
+    # Note: this assumes every version found will be built
    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
-            if ($cudaList.length > 0) {
-                $script:NVIDIA_DIR=$cudaList[0]
-            }
+    if ($cudaList.length -eq 0) {
+        $d=(get-command -ea 'silentlycontinue' nvcc).path
+        if ($null -ne $d) {
+            $script:CUDA_DIRS=@($d| split-path -parent)
        }
    } else {
-        $script:NVIDIA_DIR=$env:NVIDIA_DIR
+        $script:CUDA_DIRS=$cudaList
    }
    
    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]

    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
-    echo "Checking version"
+    Write-Output "Checking version"
    if (!$env:VERSION) {
        $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
        $pattern="v(.+)"
@ -71,7 +69,48 @@ function checkEnv() {
 function buildOllama() {
    write-host "Building ollama CLI"
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
+        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
+
+        # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
+        #        which targets to build
+
+        # Start by skipping CUDA to build everything else
+        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
+
+        # Then skip everything else and build each CUDA variant listed in $script:CUDA_DIRS
+        foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
+            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
+
+            if ($env:CUDA_LIB_DIR.Contains("v12")) {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$env:CUDA_LIB_DIR;$env:PATH"  # prepend this toolkit's bin dir so it wins PATH lookups
                    & go generate ./...
+                }
+            } else {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS=""
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$env:CUDA_LIB_DIR;$env:PATH"  # prepend this toolkit's bin dir so it wins PATH lookups
+                    & go generate ./...
+                }
+            }
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        }
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@ -83,8 +122,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
 }

 function buildApp() {
@ -103,22 +142,22 @@ function buildApp() {
 function gatherDependencies() {
    write-host "Gathering runtime dependencies"
    cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
+    md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null

    # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
    # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
    foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
-        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
+        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
    }


    cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
    if ("${env:KEY_CONTAINER}") {
        write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
            write-host "signing $file"
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -63,16 +63,36 @@ if [ -n "$NEEDS" ]; then
    exit 1
 fi

-status "Downloading ollama..."
-curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-
 for BINDIR in /usr/local/bin /usr/bin /bin; do
    echo $PATH | grep -q $BINDIR && break || continue
 done
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})

-status "Installing ollama to $BINDIR..."
+status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
-$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
+$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
+if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
+    status "Downloading Linux ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/bin/ollama" "$BINDIR/ollama"  # link the same path the guard compares
+    fi
+else
+    status "Downloading Linux ${ARCH} CLI"
+    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
+    "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
+    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
+    BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+fi
+

 install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
@ -178,6 +198,16 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
+    if [ $BUNDLE -ne 0 ]; then
+        status "Downloading Linux ROCm ${ARCH} bundle"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+
+        install_success
+        status "AMD GPU ready."
+        exit 0
+    fi
    # Look for pre-existing ROCm v6 before downloading the dependencies
    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@ -3,6 +3,7 @@
 # Script for common Dockerfile dependency installation in redhat linux based images

 set -ex
+set -o pipefail
 MACHINE=$(uname -m)

 if grep -i "centos" /etc/system-release >/dev/null; then
@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
        dnf install -y rh-git227-git
        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
 elif grep -i "rocky" /etc/system-release >/dev/null; then
    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
    dnf install -y git \
        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        pigz
 else
    echo "ERROR Unexpected distro"
    exit 1
 fi

+if [ "${MACHINE}" = "x86_64" ] ; then
+    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
+    mv /tmp/ccache /usr/local/bin/
+else
+    yum -y install epel-release
+    yum install -y ccache
+fi
+
 if [ -n "${CMAKE_VERSION}" ]; then
    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi
--- a/server/images.go
+++ b/server/images.go
@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}

-	if _, err = os.Stat(fp); err != nil {
-		return nil, "", err
-	}
-
-	var manifest *Manifest
-
-	bts, err := os.ReadFile(fp)
+	f, err := os.Open(fp)
 	if err != nil {
-		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
+		return nil, "", err
 	}
+	defer f.Close()

-	shaSum := sha256.Sum256(bts)
-	shaStr := hex.EncodeToString(shaSum[:])
+	sha256sum := sha256.New()

-	if err := json.Unmarshal(bts, &manifest); err != nil {
+	var manifest Manifest
+	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
 		return nil, "", err
 	}

-	return manifest, shaStr, nil
+	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
 }

 func GetModel(name string) (*Model, error) {
@ -374,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	parameters := make(map[string]any)

 	var layers []Layer
+	var baseLayers []*layerGGML
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
+		command := c.Name

-		switch c.Name {
+		switch command {
 		case "model", "adapter":
-			var baseLayers []*layerGGML
-			if name := model.ParseName(c.Args); name.IsValid() {
+			if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
 				baseLayers, err = parseFromModel(ctx, name, fn)
 				if err != nil {
 					return err
@ -414,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				}
 				defer blob.Close()

-				baseLayers, err = parseFromFile(ctx, blob, digest, fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
 				if err != nil {
 					return err
 				}
 			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
 				defer file.Close()

-				baseLayers, err = parseFromFile(ctx, file, "", fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
 				if err != nil {
 					return err
 				}
@ -692,43 +688,18 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
-	fp, err := GetManifestPath()
-	if err != nil {
-		return err
-	}
-
-	walkFunc := func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
+func deleteUnusedLayers(deleteMap map[string]struct{}) error {
+	manifests, err := Manifests()
 	if err != nil {
 		return err
 	}

+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}

 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}

 	// only delete the files which are still in the deleteMap
@ -781,8 +752,7 @@ func PruneLayers() error {

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
+	if err := deleteUnusedLayers(deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@ -877,20 +847,14 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// no local manifest for this model, so deleteMap stays empty
+	} else if err != nil {
 		return err
-		}
-
-		if manifest != nil {
+	} else {
 		for _, l := range manifest.Layers {
 			deleteMap[l.Digest] = struct{}{}
 		}
@ -898,7 +862,6 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}
-	}

 	if mp.ProtocolScheme == "http" && !regOpts.Insecure {
 		return errors.New("insecure protocol http")
@ -975,11 +938,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}

-	if noprune == "" {
-		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap)
-		if err != nil {
-			slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+	if !envconfig.NoPrune() && len(deleteMap) > 0 {
+		fn(api.ProgressResponse{Status: "removing unused layers"})
+		if err := deleteUnusedLayers(deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
@ -1000,12 +961,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()

-	var m *Manifest
+	var m Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}

-	return m, err
+	return &m, err
 }

 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
--- a/server/layer.go
+++ b/server/layer.go
@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 		if err := os.Rename(temp.Name(), blob); err != nil {
 			return Layer{}, err
 		}
+		if err := os.Chmod(blob, 0o644); err != nil {
+			return Layer{}, err
+		}
 	}

 	return Layer{
--- a/server/manifest.go
+++ b/server/manifest.go
@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"log/slog"
 	"os"
@ -150,14 +151,16 @@ func Manifests() (map[model.Name]*Manifest, error) {

 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
-				slog.Warn("bad manifest name", "path", rel, "error", err)
+				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}

 			m, err := ParseNamedManifest(n)
-			if err != nil {
+			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
+			} else if err != nil {
+				return nil, fmt.Errorf("%s: %w", n, err)
 			}

 			ms[n] = m
--- a/server/model.go
+++ b/server/model.go
@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }

-func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	fi, err := f.Stat()
 	if err != nil {
 		return nil, err
@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	defer t.Close()
 	defer os.Remove(t.Name())

-	fn(api.ProgressResponse{Status: "converting model"})
-	if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+	var layerType string
+
+	switch command {
+	case "adapter":
+		var baseModel *llm.GGML
+		for _, l := range baseLayers {
+			if l.GGML != nil {
+				baseModel = l.GGML
+				break
+			}
+		}
+
+		if baseModel == nil {
+			return nil, fmt.Errorf("no base model specified for the adapter")
+		}
+
+		if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
 			return nil, err
 		}
+		layerType = "application/vnd.ollama.image.adapter"
+	case "model":
+		if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+			return nil, err
+		}
+		layerType = "application/vnd.ollama.image.model"
+	}

 	if _, err := t.Seek(0, io.SeekStart); err != nil {
 		return nil, err
 	}

-	layer, err := NewLayer(t, "application/vnd.ollama.image.model")
+	layer, err := NewLayer(t, layerType)
 	if err != nil {
 		return nil, err
 	}
@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 	return detectChatTemplate(layers)
 }

-func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 	sr := io.NewSectionReader(file, 0, 512)
 	contentType, err := detectContentType(sr)
 	if err != nil {
@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 	case "gguf", "ggla":
 		// noop
 	case "application/zip":
-		return parseFromZipFile(ctx, file, digest, fn)
+		return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
 	default:
 		return nil, fmt.Errorf("unsupported content type: %s", contentType)
 	}
@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 		}

 		mediatype := "application/vnd.ollama.image.model"
-		if ggml.Name() == "ggla" {
+		if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
 			mediatype = "application/vnd.ollama.image.adapter"
 		} else if ggml.KV().Architecture() == "clip" {
 			mediatype = "application/vnd.ollama.image.projector"
--- a/server/model_test.go
+++ b/server/model_test.go
@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
+	layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 		t.Fatalf("failed to seek to start: %v", err)
 	}

-	layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
+	layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
 	if err != nil {
 		t.Fatalf("failed to parse from file: %v", err)
 	}
--- a/server/sched.go
+++ b/server/sched.go
@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}

+					// Embedding models should always be loaded with parallel=1
+					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+						numParallel = 1
+					}
+
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
@ -734,7 +739,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+	if *numParallel <= 0 {
 		*numParallel = 1
+		req.opts.NumCtx = req.origNumCtx
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus
--- a/server/sched_test.go
+++ b/server/sched_test.go
@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	require.NoError(t, llm.WriteGGUF(f, llm.KV{
 		"general.architecture":          "llama",
-		"general.name":                  "name",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.block_count":             uint32(1),
--- a/server/upload.go
+++ b/server/upload.go
@ -45,7 +45,7 @@ type blobUpload struct {
 }

 const (
-	numUploadParts          = 64
+	numUploadParts          = 16
 	minUploadPartSize int64 = 100 * format.MegaByte
 	maxUploadPartSize int64 = 1000 * format.MegaByte
 )
Author	SHA1	Message	Date
baalajimaestro	0c61920bc9	Merge https://github.com/ollama/ollama Signed-off-by: baalajimaestro <me@baalajimaestro.me>	2024-08-25 22:02:07 +05:30
Daniel Hiltgen	0f92b19bec	Only enable numa on CPUs (#6484 ) The numa flag may be having a performance impact on multi-socket systems with GPU loads	2024-08-24 17:24:50 -07:00
Daniel Hiltgen	69be940bf6	gpu: Group GPU Library sets by variant (#6483 ) The recent cuda variant changes uncovered a bug in ByLibrary which failed to group by common variant for GPU types.	2024-08-23 15:11:56 -07:00
Michael Yang	9638c24c58	Merge pull request #5446 from ollama/mxyng/faq update faq	2024-08-23 14:05:59 -07:00
Michael Yang	bb362caf88	update faq	2024-08-23 13:37:21 -07:00
Patrick Devine	0c819e167b	convert safetensor adapters into GGUF (#6327 )	2024-08-23 11:29:56 -07:00
Daniel Hiltgen	7a1e1c1caf	gpu: Ensure driver version set before variant (#6480 ) During rebasing, the ordering was inverted causing the cuda version selection logic to break, with driver version being evaluated as zero incorrectly causing a downgrade to v11.	2024-08-23 11:21:12 -07:00
Daniel Hiltgen	0b03b9c32f	llm: Align cmake define for cuda no peer copy (#6455 ) Define changed recently and this slipped through the cracks with the old name.	2024-08-23 11:20:39 -07:00
Daniel Hiltgen	90ca84172c	Fix embeddings memory corruption (#6467 ) * Fix embeddings memory corruption The patch was leading to a buffer overrun corruption. Once removed though, parallelism in server.cpp led to hitting an assert due to slot/seq IDs being >= token count. To work around this, only use slot 0 for embeddings. * Fix embed integration test assumption The token eval count has changed with recent llama.cpp bumps (0.3.5+)	2024-08-22 14:51:42 -07:00
Michael Yang	6bd8a4b0a1	Merge pull request #6064 from ollama/mxyng/convert-llama3 convert: update llama conversion for llama3.1	2024-08-21 12:57:09 -07:00
Michael Yang	77903ab8b4	llama3.1	2024-08-21 11:49:31 -07:00
Michael Yang	e22286c9e1	Merge pull request #5365 from ollama/mxyng/convert-gemma2 convert gemma2	2024-08-21 11:48:43 -07:00
Michael Yang	107f695929	Merge pull request #4917 from ollama/mxyng/convert-bert convert bert model from safetensors	2024-08-21 11:48:29 -07:00
Michael Yang	4ecc70d3b4	Merge pull request #6386 from zwwhdls/fix-new-layer fix: chmod new layer to 0o644 when creating it	2024-08-21 10:58:45 -07:00
Michael Yang	3546bbd08c	convert gemma2	2024-08-20 17:27:51 -07:00
Michael Yang	beb49eef65	create bert models from cli	2024-08-20 17:27:34 -07:00
Michael Yang	5a28b9cf5f	bert	2024-08-20 17:27:34 -07:00
Daniel Hiltgen	a017cf2fea	Split rocm back out of bundle (#6432 ) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle.	2024-08-20 07:26:38 -07:00
Daniel Hiltgen	19e5a890f7	CI: remove directories from dist dir before upload step (#6429 )	2024-08-19 15:19:21 -07:00
Daniel Hiltgen	f91c9e3709	CI: handle directories during checksum (#6427 )	2024-08-19 13:48:45 -07:00
Daniel Hiltgen	2df6905ede	Merge pull request #6424 from dhiltgen/cuda_v12 Fix overlapping artifact name on CI	2024-08-19 12:11:58 -07:00
Daniel Hiltgen	d8be22e47d	Fix overlapping artifact name on CI	2024-08-19 12:07:18 -07:00
Daniel Hiltgen	652c273f0e	Merge pull request #5049 from dhiltgen/cuda_v12 Cuda v12	2024-08-19 11:14:24 -07:00
Daniel Hiltgen	88e7705079	Merge pull request #6402 from rick-github/numParallel Override numParallel in pickBestPartialFitByLibrary() only if unset.	2024-08-19 11:07:22 -07:00
Daniel Hiltgen	f9e31da946	Review comments	2024-08-19 10:36:15 -07:00
Daniel Hiltgen	88bb9e3328	Adjust layout to bin+lib/ollama	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	3b19cdba2a	Remove Jetpack	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	927d98a6cd	Add windows cuda v12 + v11 support	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	f6c811b320	Enable cuda v12 flags	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	4fe3a556fa	Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants.	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	fc3b4cda89	Report GPU variant in log	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	d470ebe78b	Add Jetson cuda variants for arm This adds new variants for arm64 specific to Jetson platforms	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	c7bcb00319	Wire up ccache and pigz in the docker based build This should help speed things up a little	2024-08-19 09:38:53 -07:00
Daniel Hiltgen	74d45f0102	Refactor linux packaging This adjusts linux to follow a similar model to windows with a discrete archive (zip/tgz) to carry the primary executable and dependent libraries. Runners are still carried as payloads inside the main binary. Darwin retains the payload model where the go binary is fully self-contained.	2024-08-19 09:38:53 -07:00
Jeffrey Morgan	9fddef3731	server: limit upload parts to 16 (#6411 )	2024-08-19 09:20:52 -07:00
Richard Lyons	885cf45087	Fix white space.	2024-08-18 03:07:16 +02:00
Richard Lyons	9352eeb752	Reset NumCtx.	2024-08-18 02:55:01 +02:00
Richard Lyons	0ad0e738cd	Override numParallel only if unset.	2024-08-18 01:43:26 +02:00
zwwhdls	bdc4308afb	fix: chmod new layer to 0o644 when creating it Signed-off-by: zwwhdls <zww@hdls.me>	2024-08-16 11:43:19 +08:00
Daniel Hiltgen	d29cd4c2ed	Merge pull request #6381 from eust-w/main fix: Add tooltip to system tray icon	2024-08-15 15:31:15 -07:00
eust-w	a84c05cf91	fix: Add tooltip to system tray icon - Updated setIcon method to include tooltip text for the system tray icon. - Added NIF_TIP flag and set the tooltip text using UTF16 encoding. Resolves: #6372	2024-08-16 06:00:12 +08:00
Michael Yang	e3d7f32af7	Merge pull request #6363 from ollama/mxyng/fix-noprune fix: noprune on pull	2024-08-15 12:20:38 -07:00
Michael Yang	3a75e74e34	only skip invalid json manifests	2024-08-15 10:29:14 -07:00
Michael Yang	237dccba1e	skip invalid manifest files	2024-08-14 16:55:45 -07:00
Michael Yang	b3f75fc812	fix noprune	2024-08-14 15:48:51 -07:00