Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
This commit is contained in:
baalajimaestro 2024-09-15 23:49:24 +05:30
commit 7f1565721c
Signed by: baalajimaestro
GPG key ID: B5B69626E67EE82A
51 changed files with 1383 additions and 868 deletions

View file

@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .cache
 test_data
+llm/build
+llama/build

View file

@@ -102,8 +102,8 @@ jobs:
 with:
 name: generate-windows-cpu
 path: |
-llm/build/**/bin/*
-llm/build/**/*.a
+build/**/*
+build/**/*.a
 dist/windows-amd64/**
 # ROCm generation step
@@ -176,7 +176,7 @@ jobs:
 with:
 name: generate-windows-rocm
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:
@@ -265,7 +265,7 @@ jobs:
 with:
 name: generate-windows-cuda-${{ matrix.cuda.version }}
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:
@@ -338,7 +338,7 @@ jobs:
 - uses: actions/download-artifact@v4
 with:
 name: generate-windows-rocm
-- run: dir llm/build
+- run: dir build
 - run: |
 $gopath=(get-command go).source | split-path -parent
 & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -359,9 +359,7 @@ jobs:
 environment: release
 runs-on: linux
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: amd64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:
@@ -369,14 +367,8 @@ jobs:
 - name: Set Version
 shell: bash
 run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-amd64
@@ -390,9 +382,7 @@ jobs:
 environment: release
 runs-on: linux-arm64
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: arm64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:
@@ -421,14 +411,8 @@ jobs:
 sudo usermod -aG docker $USER
 sudo apt-get install acl
 sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-arm64
@@ -436,6 +420,181 @@ jobs:
 dist/*linux*
 !dist/*-cov
# Container image build
build-linux:
environment: release
strategy:
matrix:
runner:
- linux
- linux-arm64
runs-on: ${{ matrix.runner }}
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: 'Install Docker'
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
run: |
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
platforms: linux/${{ env.ARCH }}
build-args: |
GOFLAGS
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
with:
name: digests-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
environment: release
runs-on: linux
needs:
- build-linux
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download digests
uses: actions/download-artifact@v4
with:
path: /tmp/digests
pattern: digests-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-linux-rocm:
environment: release
runs-on: linux
env:
FINAL_IMAGE_REPO: ollama/ollama
ARCH: amd64
PLATFORM_PAIR: linux-amd64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
push: true
 # Aggregate all the assets and ship a release
 release:
 needs:
@@ -448,8 +607,6 @@ jobs:
 permissions:
 contents: write
 env:
-OLLAMA_SKIP_IMAGE_BUILD: '1'
-PUSH: '1'
 GH_TOKEN: ${{ github.token }}
 steps:
 - uses: actions/checkout@v4
@@ -458,12 +615,6 @@ jobs:
 run: |
 echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
 echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-- run: ./scripts/build_docker.sh
 - name: Retrieve built artifact
 uses: actions/download-artifact@v4
 with:

View file

@@ -81,12 +81,6 @@ jobs:
 if: ${{ ! startsWith(matrix.os, 'windows-') }}
 name: 'Unix Go Generate'
 - run: go build .
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-path: |
-llm/build/**/bin/*
-llm/build/**/*.a
 generate-cuda:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: cuda-${{ matrix.cuda-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
 generate-rocm:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: rocm-${{ matrix.rocm-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
 # ROCm generation step
 generate-windows-rocm:
@@ -189,7 +171,6 @@ jobs:
 name: go generate
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
 # CUDA generation step
 generate-windows-cuda:
@@ -231,7 +212,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
 lint:
 strategy:
@@ -263,14 +243,6 @@ jobs:
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
 - uses: golangci/golangci-lint-action@v6
 with:
 args: --timeout 8m0s -v
@@ -301,23 +273,10 @@ jobs:
 cache: true
 - run: |
 case ${{ matrix.arch }} in
-amd64) echo ARCH=x86_64 ;;
+amd64) echo ARCH=amd64 ;;
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
-shell: bash
 - run: go generate ./...
 - run: go build
 - run: go test -v ./...
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-binaries
-path: ollama

3
.gitignore vendored
View file

@@ -12,4 +12,7 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*

View file

@@ -312,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
+- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
 ### Terminal
@@ -336,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
+- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -358,6 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
+- [crewAI](https://github.com/crewAIInc/crewAI)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
@@ -427,6 +430,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
+- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
+- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
 ### Supported backends

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1,8 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/amd64/*
var EmbedFS embed.FS

View file

@@ -0,0 +1,8 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/arm64/*
var EmbedFS embed.FS

6
build/embed_linux.go Normal file
View file

@@ -0,0 +1,6 @@
package build
import "embed"
//go:embed linux/*
var EmbedFS embed.FS

8
build/embed_unused.go Normal file
View file

@@ -0,0 +1,8 @@
//go:build !linux && !darwin
package build
import "embed"
// unused on windows
var EmbedFS embed.FS

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command
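The placeholder files above exist only because `go:embed` refuses to compile when a pattern matches no files, so an empty `build/<os>/<arch>/` tree would break `go build` before any payloads have been generated. The sketch below is illustrative only and not part of this diff: it assumes the new package is importable as `github.com/ollama/ollama/build` and simply walks whatever ended up in the embedded tree (with nothing but the placeholder present it prints a single entry).

```go
package main

import (
	"fmt"
	"io/fs"

	"github.com/ollama/ollama/build" // assumed import path for the new build package
)

func main() {
	// embed.FS implements fs.FS, so the embedded payload tree can be walked
	// like any other filesystem.
	_ = fs.WalkDir(build.EmbedFS, ".", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		fmt.Println(path)
		return nil
	})
}
```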

View file

@@ -2,6 +2,7 @@ package cmd
 import (
 "archive/zip"
+"bufio"
 "bytes"
 "context"
 "crypto/ed25519"
@@ -21,6 +22,7 @@ import (
 "regexp"
 "runtime"
 "slices"
+"strconv"
 "strings"
 "sync/atomic"
 "syscall"
@@ -344,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
 return len(p), nil
 }
func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
req := &api.GenerateRequest{
Model: opts.Model,
KeepAlive: opts.KeepAlive,
}
return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
}
func StopHandler(cmd *cobra.Command, args []string) error {
opts := &runOptions{
Model: args[0],
KeepAlive: &api.Duration{Duration: 0},
}
if err := loadOrUnloadModel(cmd, opts); err != nil {
if strings.Contains(err.Error(), "not found") {
return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
}
}
return nil
}
 func RunHandler(cmd *cobra.Command, args []string) error {
 interactive := true
@@ -422,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 opts.ParentModel = info.Details.ParentModel
 if interactive {
-if err := loadModel(cmd, &opts); err != nil {
+if err := loadOrUnloadModel(cmd, &opts); err != nil {
 return err
 }
@@ -578,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 table.SetHeaderLine(false)
 table.SetBorder(false)
 table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
+table.SetTablePadding(" ")
 table.AppendBulk(data)
 table.Render()
@@ -613,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
 procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
 }
-data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+var until string
+delta := time.Since(m.ExpiresAt)
+if delta > 0 {
+until = "Stopping..."
+} else {
+until = format.HumanTime(m.ExpiresAt, "Never")
+}
+data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
 }
 }
@@ -624,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 table.SetHeaderLine(false)
 table.SetBorder(false)
 table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
+table.SetTablePadding(" ")
 table.AppendBulk(data)
 table.Render()
@@ -720,125 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 return nil
 }
-showInfo(resp)
-return nil
+return showInfo(resp, os.Stdout)
 }
-func showInfo(resp *api.ShowResponse) {
-modelData := [][]string{
-{"parameters", resp.Details.ParameterSize},
-{"quantization", resp.Details.QuantizationLevel},
-}
-if resp.ModelInfo != nil {
-arch := resp.ModelInfo["general.architecture"].(string)
-modelData = append(modelData,
-[]string{"arch", arch},
-[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
-)
-}
-mainTableData := [][]string{
-{"Model"},
-{renderSubTable(modelData, false)},
-}
+func showInfo(resp *api.ShowResponse, w io.Writer) error {
+tableRender := func(header string, rows func() [][]string) {
+fmt.Fprintln(w, " ", header)
+table := tablewriter.NewWriter(w)
+table.SetAlignment(tablewriter.ALIGN_LEFT)
+table.SetBorder(false)
+table.SetNoWhiteSpace(true)
+table.SetTablePadding(" ")
+switch header {
+case "Template", "System", "License":
+table.SetColWidth(100)
+}
+table.AppendBulk(rows())
+table.Render()
+fmt.Fprintln(w)
+}
+tableRender("Model", func() (rows [][]string) {
+if resp.ModelInfo != nil {
+arch := resp.ModelInfo["general.architecture"].(string)
+rows = append(rows, []string{"", "architecture", arch})
+rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
+rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
+rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+} else {
+rows = append(rows, []string{"", "architecture", resp.Details.Family})
+rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
+}
+rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
+return
+})
 if resp.ProjectorInfo != nil {
-projectorData := [][]string{
-{"arch", "clip"},
-{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-}
-if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
-projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
-}
-projectorData = append(projectorData,
-[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
-)
-mainTableData = append(mainTableData,
-[]string{"Projector"},
-[]string{renderSubTable(projectorData, false)},
-)
+tableRender("Projector", func() (rows [][]string) {
+arch := resp.ProjectorInfo["general.architecture"].(string)
+rows = append(rows, []string{"", "architecture", arch})
+rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
+rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
+rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
+return
+})
 }
 if resp.Parameters != "" {
-mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+tableRender("Parameters", func() (rows [][]string) {
+scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
+for scanner.Scan() {
+if text := scanner.Text(); text != "" {
+rows = append(rows, append([]string{""}, strings.Fields(text)...))
+}
+}
+return
+})
 }
+head := func(s string, n int) (rows [][]string) {
+scanner := bufio.NewScanner(strings.NewReader(s))
+for scanner.Scan() && (len(rows) < n || n < 0) {
+if text := scanner.Text(); text != "" {
+rows = append(rows, []string{"", strings.TrimSpace(text)})
+}
+}
+return
+}
 if resp.System != "" {
-mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+tableRender("System", func() [][]string {
+return head(resp.System, 2)
+})
 }
 if resp.License != "" {
-mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
+tableRender("License", func() [][]string {
+return head(resp.License, 2)
+})
 }
-table := tablewriter.NewWriter(os.Stdout)
-table.SetAutoWrapText(false)
-table.SetBorder(false)
-table.SetAlignment(tablewriter.ALIGN_LEFT)
-for _, v := range mainTableData {
-table.Append(v)
-}
-table.Render()
-}
-func renderSubTable(data [][]string, file bool) string {
-var buf bytes.Buffer
-table := tablewriter.NewWriter(&buf)
-table.SetAutoWrapText(!file)
-table.SetBorder(false)
-table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
-table.SetAlignment(tablewriter.ALIGN_LEFT)
-for _, v := range data {
-table.Append(v)
-}
-table.Render()
-renderedTable := buf.String()
-lines := strings.Split(renderedTable, "\n")
-for i, line := range lines {
-lines[i] = "\t" + line
-}
-return strings.Join(lines, "\n")
-}
-func twoLines(s string) [][]string {
-lines := strings.Split(s, "\n")
-res := [][]string{}
-count := 0
-for _, line := range lines {
-line = strings.TrimSpace(line)
-if line != "" {
-count++
-res = append(res, []string{line})
-if count == 2 {
-return res
-}
-}
-}
-return res
-}
-func formatParams(s string) string {
-lines := strings.Split(s, "\n")
-table := [][]string{}
-for _, line := range lines {
-table = append(table, strings.Fields(line))
-}
-return renderSubTable(table, false)
-}
+return nil
+}
 func CopyHandler(cmd *cobra.Command, args []string) error {
@@ -1328,6 +1335,15 @@ func NewCLI() *cobra.Command {
 runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 runCmd.Flags().String("format", "", "Response format (e.g. json)")
+stopCmd := &cobra.Command{
+Use: "stop MODEL",
+Short: "Stop a running model",
+Args: cobra.ExactArgs(1),
+PreRunE: checkServerHeartbeat,
+RunE: StopHandler,
+}
 serveCmd := &cobra.Command{
 Use: "serve",
 Aliases: []string{"start"},
@@ -1395,6 +1411,7 @@ func NewCLI() *cobra.Command {
 createCmd,
 showCmd,
 runCmd,
+stopCmd,
 pullCmd,
 pushCmd,
 listCmd,
@@ -1434,6 +1451,7 @@ func NewCLI() *cobra.Command {
 createCmd,
 showCmd,
 runCmd,
+stopCmd,
 pullCmd,
 pushCmd,
 listCmd,

206
cmd/cmd_test.go Normal file
View file

@@ -0,0 +1,206 @@
package cmd
import (
"bytes"
"os"
"path/filepath"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
func TestShowInfo(t *testing.T) {
t.Run("bare details", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("bare model info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
ModelInfo: map[string]any{
"general.architecture": "test",
"general.parameter_count": float64(7_000_000_000),
"test.context_length": float64(0),
"test.embedding_length": float64(0),
},
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
context length 0
embedding length 0
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("parameters", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
Parameters: `
stop never
stop gonna
stop give
stop you
stop up
temperature 99`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Parameters
stop never
stop gonna
stop give
stop you
stop up
temperature 99
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("project info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
ProjectorInfo: map[string]any{
"general.architecture": "clip",
"general.parameter_count": float64(133_700_000),
"clip.vision.embedding_length": float64(0),
"clip.vision.projection_dim": float64(0),
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Projector
architecture clip
parameters 133.70M
embedding length 0
dimensions 0
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("system", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
System: `You are a pirate!
Ahoy, matey!
Weigh anchor!
`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
System
You are a pirate!
Ahoy, matey!
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("license", func(t *testing.T) {
var b bytes.Buffer
license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
if err != nil {
t.Fatal(err)
}
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
License: string(license),
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
License
MIT License
Copyright (c) Ollama
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
}

View file

@@ -18,7 +18,6 @@ import (
 "github.com/ollama/ollama/api"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/parser"
-"github.com/ollama/ollama/progress"
 "github.com/ollama/ollama/readline"
 "github.com/ollama/ollama/types/errtypes"
 )
@@ -31,26 +30,6 @@ const (
 MultilineSystem
 )
-func loadModel(cmd *cobra.Command, opts *runOptions) error {
-p := progress.NewProgress(os.Stderr)
-defer p.StopAndClear()
-spinner := progress.NewSpinner("")
-p.Add("", spinner)
-client, err := api.ClientFromEnvironment()
-if err != nil {
-return err
-}
-chatReq := &api.ChatRequest{
-Model: opts.Model,
-KeepAlive: opts.KeepAlive,
-}
-return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
-}
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 usage := func() {
 fmt.Fprintln(os.Stderr, "Available Commands:")
@@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 opts.Model = args[1]
 opts.Messages = []api.Message{}
 fmt.Printf("Loading model '%s'\n", opts.Model)
-if err := loadModel(cmd, &opts); err != nil {
+if err := loadOrUnloadModel(cmd, &opts); err != nil {
 return err
 }
 continue
@@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 switch args[1] {
 case "info":
-showInfo(resp)
+_ = showInfo(resp, os.Stderr)
 case "license":
 if resp.License == "" {
 fmt.Println("No license was specified for this model.")

View file

@@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 return err
 }
-if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
-slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
+vocabSize := int(p.VocabSize)
+switch {
+case vocabSize > len(t.Vocabulary.Tokens):
+slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 for i := range vocabSize - len(t.Vocabulary.Tokens) {
 t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 }
-} else {
+case vocabSize < len(t.Vocabulary.Tokens):
+return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+default:
 slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 }
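For readers skimming the hunk above, the new behaviour reduces to the following standalone sketch (function and variable names are illustrative, not the converter's real types): a declared vocabulary size larger than the token list is padded with `[PADn]` entries, a smaller one is now a hard error instead of being accepted silently.

```go
package main

import "fmt"

// padVocab mirrors the switch introduced above: pad when the declared size is
// larger than the token list, fail when it is smaller, otherwise leave it alone.
func padVocab(tokens []string, vocabSize int) ([]string, error) {
	switch {
	case vocabSize > len(tokens):
		for i := range vocabSize - len(tokens) {
			tokens = append(tokens, fmt.Sprintf("[PAD%d]", i))
		}
	case vocabSize < len(tokens):
		return nil, fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(tokens), vocabSize)
	}
	return tokens, nil
}

func main() {
	padded, err := padVocab([]string{"<s>", "</s>"}, 4)
	fmt.Println(padded, err) // [<s> </s> [PAD0] [PAD1]] <nil>
}
```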

View file

@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "Why is the sky blue?"
 }'
 ```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "response": "The",
 "done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "",
 "done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "Why is the sky blue?",
 "stream": false
 }'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "The sky is blue because it is the color of the sky.",
 "done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "What color is the sky at different times of the day? Respond using JSON",
 "format": "json",
 "stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-11-09T21:07:55.186497Z",
 "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
 "done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "Why is the sky blue?",
 "stream": false,
 "options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "response": "The sky is blue because it is the color of the sky.",
 "done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3"
+"model": "llama3.1"
 }'
 ```
@@ -400,7 +400,7 @@ A single JSON object is returned:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-12-18T19:52:07.071755Z",
 "response": "",
 "done": true
@@ -445,7 +445,7 @@ Send a chat message with a streaming response.
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
 "messages": [
 {
 "role": "user",
@@ -461,7 +461,7 @@ A stream of JSON objects is returned:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "message": {
 "role": "assistant",
@@ -476,7 +476,7 @@ Final response:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "done": true,
 "total_duration": 4883583458,
@@ -494,7 +494,7 @@ Final response:
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
 "messages": [
 {
 "role": "user",
@@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-"model": "registry.ollama.ai/library/llama3:latest",
+"model": "llama3.1",
 "created_at": "2023-12-12T14:13:43.416799Z",
 "message": {
 "role": "assistant",
@@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
 "messages": [
 {
 "role": "user",
@@ -557,7 +557,7 @@ A stream of JSON objects is returned:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T08:52:19.385406455-07:00",
 "message": {
 "role": "assistant",
@@ -571,7 +571,7 @@ Final response:
 ```json
 {
-"model": "llama3",
+"model": "llama3.1",
 "created_at": "2023-08-04T19:22:45.499127Z",
 "done": true,
 "total_duration": 8113331500,
@@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{
 ```shell
 curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
 "messages": [
 {
 "role": "user",
@@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-"model": "registry.ollama.ai/library/llama3:latest",
+"model": "llama3.1",
 "created_at": "2023-12-12T14:13:43.416799Z",
 "message": {
 "role": "assistant",
@@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-"name": "llama3"
+"name": "llama3.1"
 }'
 ```
@@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.
 ```shell
 curl http://localhost:11434/api/copy -d '{
-"source": "llama3",
+"source": "llama3.1",
 "destination": "llama3-backup"
 }'
 ```
@@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ```shell
 curl http://localhost:11434/api/pull -d '{
-"name": "llama3"
+"name": "llama3.1"
 }'
 ```

View file

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 ```shell
 curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "Why is the sky blue?",
 "options": {
 "num_ctx": 4096
@@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to:
 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
 ```
 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
 ```
 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
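The same preload/unload behaviour is what the new `ollama stop` command uses internally (see the `loadOrUnloadModel` helper in `cmd/cmd.go` above). A minimal sketch from the Go side, assuming the `github.com/ollama/ollama/api` client package and a locally running server; the model name here is only an example:

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// An empty prompt with keep_alive set to 0 asks the server to unload the
	// model, mirroring the curl example above; a positive duration (or -1)
	// would preload it and keep it resident instead.
	req := &api.GenerateRequest{
		Model:     "llama3.1",
		KeepAlive: &api.Duration{Duration: 0},
	}
	if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
		log.Fatal(err)
	}
}
```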

View file

@@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
 - [FROM (Required)](#from-required)
-- [Build from llama3.1](#build-from-llama31)
+- [Build from existing model](#build-from-existing-model)
 - [Build from a Safetensors model](#build-from-a-safetensors-model)
 - [Build from a GGUF file](#build-from-a-gguf-file)
 - [PARAMETER](#parameter)
@@ -50,7 +50,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
 ```modelfile
-FROM llama3
+FROM llama3.1
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples).
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.
 ```bash
-> ollama show --modelfile llama3
+> ollama show --modelfile llama3.1
 # Modelfile generated by "ollama show"
 # To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama3:latest
+# FROM llama3.1:latest
 FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
@@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```
-#### Build from llama3.1
+#### Build from existing model
 ```modelfile
 FROM llama3.1
View file

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
 'content': 'Say this is a test',
 }
 ],
-model='llama3',
+model='llama3.1',
 )
 response = client.chat.completions.create(
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
 )
 completion = client.completions.create(
-model="llama3",
+model="llama3.1",
 prompt="Say this is a test",
 )
 list_completion = client.models.list()
-model = client.models.retrieve("llama3")
+model = client.models.retrieve("llama3.1")
 embeddings = client.embeddings.create(
 model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({
 const chatCompletion = await openai.chat.completions.create({
 messages: [{ role: 'user', content: 'Say this is a test' }],
-model: 'llama3',
+model: 'llama3.1',
 })
 const response = await openai.chat.completions.create({
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
 })
 const completion = await openai.completions.create({
-model: "llama3",
+model: "llama3.1",
 prompt: "Say this is a test.",
 })
 const listCompletion = await openai.models.list()
-const model = await openai.models.retrieve("llama3")
+const model = await openai.models.retrieve("llama3.1")
 const embedding = await openai.embeddings.create({
 model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
 curl http://localhost:11434/v1/chat/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "llama3",
+"model": "llama3.1",
 "messages": [
 {
 "role": "system",
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
 curl http://localhost:11434/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-"model": "llama3",
+"model": "llama3.1",
 "prompt": "Say this is a test"
 }'
 curl http://localhost:11434/v1/models
-curl http://localhost:11434/v1/models/llama3
+curl http://localhost:11434/v1/models/llama3.1
 curl http://localhost:11434/v1/embeddings \
 -H "Content-Type: application/json" \
@@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
 Before using a model, pull it locally `ollama pull`:
 ```shell
-ollama pull llama3
+ollama pull llama3.1
 ```
 ### Default model names
@@ -282,7 +282,7 @@ ollama pull llama3
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
 ```
-ollama cp llama3 gpt-3.5-turbo
+ollama cp llama3.1 gpt-3.5-turbo
 ```
 Afterwards, this new model name can be specified the `model` field:

View file

@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
 To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
 ```dockerfile
-FROM llama3
+FROM llama3.1
 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

View file

@@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
+## AMD GPU Discovery
+On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
+- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
+- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
+- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
 ## Windows Terminal Errors
 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer.

View file

@@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```
 ## Troubleshooting

View file

@ -179,53 +179,6 @@ var (
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION") HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
) )
func RunnersDir() (p string) {
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
return p
}
if runtime.GOOS != "windows" {
return
}
defer func() {
if p == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
}
}()
// On Windows we do not carry the payloads inside the main executable
exe, err := os.Executable()
if err != nil {
return
}
cwd, err := os.Getwd()
if err != nil {
return
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
p = candidate
break
}
}
return p
}
func Uint(key string, defaultValue uint) func() uint { func Uint(key string, defaultValue uint) func() uint {
return func() uint { return func() uint {
if s := Var(key); s != "" { if s := Var(key); s != "" {
@ -290,10 +243,22 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
} }
if runtime.GOOS != "windows" {
// Windows environment variables are case-insensitive so there's no need to duplicate them
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
}
if runtime.GOOS != "darwin" { if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
@ -302,6 +267,7 @@ func AsMap() map[string]EnvVar {
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
} }
return ret return ret
} }
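The proxy variables are surfaced here purely for visibility in the settings dump. As a quick illustration (not part of this change), the map returned by `AsMap` can be iterated to log the effective configuration; this minimal sketch assumes only the `envconfig.AsMap` signature shown above:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// Print every known setting, including the informational proxy entries.
	// %+v is used so the sketch does not depend on the EnvVar field names.
	for name, v := range envconfig.AsMap() {
		fmt.Printf("%-24s %+v\n", name, v)
	}
}
```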

View file

@ -1,6 +1,6 @@
langchain==0.0.274 langchain==0.0.274
gpt4all==1.0.8 gpt4all==1.0.8
chromadb==0.4.7 chromadb==0.5.0
llama-cpp-python==0.1.81 llama-cpp-python==0.1.81
urllib3==2.0.4 urllib3==2.0.4
PyMuPDF==1.23.5 PyMuPDF==1.23.5
@ -12,4 +12,4 @@ pandoc==2.3
pypandoc==1.11 pypandoc==1.11
tqdm==4.66.1 tqdm==4.66.1
sentence_transformers==2.2.2 sentence_transformers==2.2.2
numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability

View file

@ -4,5 +4,5 @@ SYSTEM """
You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer. You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
""" """
PARAMETER TEMPERATURE 0.3 PARAMETER temperature 0.3

View file

@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory.
2. Install the Python Requirements. 2. Install the Python Requirements.
```bash ```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```

View file

@ -1 +1 @@
Requests==2.31.0 Requests>=2.32.3

View file

@ -5,6 +5,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"io/fs"
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
if len(resp) == 0 { if len(resp) == 0 {
slog.Info("no compatible amdgpu devices detected") slog.Info("no compatible amdgpu devices detected")
} }
if err := verifyKFDDriverAccess(); err != nil {
slog.Error("amdgpu devices detected but permission problems block access", "error", err)
return nil
}
return resp return resp
} }
@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
} }
return usedMemory, nil return usedMemory, nil
} }
func verifyKFDDriverAccess() error {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
} else if errors.Is(err, fs.ErrNotExist) {
// Container runtime failure?
return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
return nil
}

View file

@ -1,148 +0,0 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/ollama/ollama/envconfig"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir()
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
}
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
} else {
err = os.MkdirAll(tmpDir, 0o755)
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
}
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir()
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info("updating", "PATH", newPath)
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

View file

@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
localAppData := os.Getenv("LOCALAPPDATA") localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
} }
tmpDir, _ := PayloadsDir() libDir := LibraryDir()
if tmpDir != "" { if libDir != "" {
// TODO - add "payloads" for subprocess cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
} }
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
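With this change the CUDA runtime management library is searched for next to the bundled libraries (`LibraryDir()`) rather than the extracted payload directory, before falling back to the system-wide globs. A minimal sketch of that pattern-based lookup, with `findFirstMatch` as a hypothetical helper name:

```go
package gpu

import "path/filepath"

// findFirstMatch returns the first file matching any of the candidate glob
// patterns, trying them in priority order (bundled libraries first, then
// system-wide locations).
func findFirstMatch(patterns []string) (string, bool) {
	for _, pattern := range patterns {
		if matches, err := filepath.Glob(pattern); err == nil && len(matches) > 0 {
			return matches[0], true
		}
	}
	return "", false
}
```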

View file

@ -913,7 +913,9 @@ struct llama_server_context
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
slot.generated_text += token_str; if (!llama_token_is_eog(model, result.tok))
slot.generated_text += token_str;
slot.has_next_token = true; slot.has_next_token = true;
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@ -954,30 +956,36 @@ struct llama_server_context
if (!incomplete) if (!incomplete)
{ {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos)
{
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict if (!llama_token_is_eog(model, result.tok)) {
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) const std::string str_test = slot.generated_text.substr(pos);
{ bool is_stop_full = false;
// no send the stop word in the response size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
result.text_to_send = slot.generated_text.substr(pos, std::string::npos); if (stop_pos != std::string::npos)
slot.n_sent_text += result.text_to_send.size(); {
// add the token to slot queue and cache is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
{
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
} else {
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
} }
if (slot.params.stream) if (slot.params.stream)
@ -1117,9 +1125,7 @@ struct llama_server_context
{"multimodal", multimodal} {"multimodal", multimodal}
}; };
if (!llama_token_is_eog(model, tkn.tok)) { res.result_json["content"] = tkn.text_to_send;
res.result_json["content"] = tkn.text_to_send;
}
if (slot.sparams.n_probs > 0) if (slot.sparams.n_probs > 0)
{ {
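The server changes above keep end-of-generation tokens out of `generated_text` and only stream text once stop sequences have been checked. A simplified Go sketch of that stop-string trimming (it ignores the partial-match case handled by `STOP_PARTIAL`):

```go
package main

import (
	"fmt"
	"strings"
)

// trimAtStop returns the portion of pending text that is safe to stream and
// reports whether a full stop sequence was found, in which case generation
// should halt and the stop text itself is never sent.
func trimAtStop(pending string, stops []string) (string, bool) {
	cut := -1
	for _, s := range stops {
		if i := strings.Index(pending, s); i != -1 && (cut == -1 || i < cut) {
			cut = i
		}
	}
	if cut == -1 {
		return pending, false
	}
	return pending[:cut], true
}

func main() {
	text, stopped := trimAtStop("The answer is 42.\nUser:", []string{"User:"})
	fmt.Printf("%q stopped=%v\n", text, stopped) // "The answer is 42.\n" stopped=true
}
```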

View file

@ -31,6 +31,7 @@ init_vars() {
NO_WHOLE_ARCHIVE="" NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}" GCC_ARCH="-arch ${ARCH}"
DIST_BASE=../../dist/darwin-${GOARCH}/ DIST_BASE=../../dist/darwin-${GOARCH}/
PAYLOAD_BASE=../../build/darwin/${GOARCH}
;; ;;
"Linux") "Linux")
LIB_EXT="so" LIB_EXT="so"
@ -40,6 +41,7 @@ init_vars() {
# Cross compiling not supported on linux - Use docker # Cross compiling not supported on linux - Use docker
GCC_ARCH="" GCC_ARCH=""
DIST_BASE=../../dist/linux-${GOARCH}/ DIST_BASE=../../dist/linux-${GOARCH}/
PAYLOAD_BASE=../../build/linux/${GOARCH}
;; ;;
*) *)
;; ;;
@ -47,7 +49,8 @@ init_vars() {
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi fi
GZIP=$(which pigz 2>/dev/null || echo "gzip") GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
} }
git_module_setup() { git_module_setup() {
@ -91,17 +94,34 @@ build() {
rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
} }
compress() { dist() {
echo "Compressing payloads to reduce overall binary size..." [ -z "${RUNNER}" ] && exit 1
rm -rf ${BUILD_DIR}/bin/*.gz mkdir -p ${RUNNER_BASE}/${RUNNER}/
for f in ${BUILD_DIR}/bin/* ; do for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -n --best -f ${f} & cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
fi
}
# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
compress() {
[ -z "${RUNNER}" ] && exit 1
echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!" compress_pids+=" $!"
done done
# check for lib directory # check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do for f in ${BUILD_DIR}/lib/* ; do
${GZIP} -n --best -f ${f} & ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!" compress_pids+=" $!"
done done
fi fi
@ -117,7 +137,7 @@ wait_for_compress() {
install() { install() {
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
rm -f "${BUILD_DIR}/bin/$(basename ${lib})" rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
cp -af "${lib}" "${BUILD_DIR}/bin/" cp -af "${lib}" "${BUILD_DIR}/bin/"
done done

View file

@ -39,7 +39,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu" RUNNER=cpu
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server
@ -51,7 +52,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" RUNNER=cpu_avx
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server
@ -63,7 +65,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" RUNNER=cpu_avx2
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build build
@ -84,7 +87,8 @@ case "${GOARCH}" in
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal" RUNNER="metal"
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server

View file

@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
init_vars init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu" RUNNER="cpu"
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building custom CPU" echo "Building custom CPU"
build build
install install
dist
compress compress
else else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu" RUNNER=cpu
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
install install
dist
compress compress
fi fi
@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx" RUNNER=cpu_avx
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
install install
dist
compress compress
fi fi
@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" RUNNER=cpu_avx2
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
build build
install install
dist
compress compress
fi fi
fi fi
@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
fi fi
export CUDAFLAGS="-t8" export CUDAFLAGS="-t8"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" RUNNER=cuda${CUDA_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
build build
install install
dist
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
mkdir -p "${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}"
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
CC=icx CC=icx
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
BUILD_DIR="../build/linux/${ARCH}/oneapi" RUNNER=oneapi
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
install install
dist
compress compress
fi fi
@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}" CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
echo "Building custom ROCM GPU" echo "Building custom ROCM GPU"
fi fi
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" RUNNER=rocm${ROCM_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
# ROCm dependencies are too large to fit into a unified bundle # ROCm dependencies are too large to fit into a unified bundle
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
# TODO figure out how to disable runpath (rpath) # TODO figure out how to disable runpath (rpath)
@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
# copy the ROCM dependencies # copy the ROCM dependencies
mkdir -p "${ROCM_DIST_DIR}" mkdir -p "${ROCM_DIST_DIR}"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
cp -a "${dep}"* "${ROCM_DIST_DIR}" cp -a "${dep}"* "${ROCM_DIST_DIR}"
if [ $(readlink -f "${dep}") != "${dep}" ] ; then
cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
fi
done done
install install
dist
compress compress
fi fi
cleanup cleanup
wait_for_compress wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"

View file

@ -1,11 +1,7 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
//go:embed build/darwin/arm64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{} var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View file

@ -1,11 +0,0 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View file

@ -1,11 +1,7 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{} var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
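These per-platform `//go:embed` declarations are replaced by a shared `build.EmbedFS`, which `runners.Refresh` consumes later in this commit. The new `build` package itself is not shown in this excerpt; a hypothetical sketch of the pattern, assuming payloads laid out as `$OS/$GOARCH/$RUNNER/...`, might look like:

```go
// Package build carries the compressed runner payloads produced by the
// generate scripts. This is an illustrative sketch only; the real file and
// embed directives are not part of this excerpt.
package build

import "embed"

// EmbedFS exposes payloads as $OS/$GOARCH/$RUNNER/<file>.gz so that
// runners.Refresh can extract them at startup.
//
//go:embed linux darwin
var EmbedFS embed.FS
```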

View file

@ -1,13 +1,9 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
// unused on windows
var libEmbed embed.FS
const CREATE_DEFAULT_ERROR_MODE = 0x04000000 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
var LlamaServerSysProcAttr = &syscall.SysProcAttr{ var LlamaServerSysProcAttr = &syscall.SysProcAttr{

View file

@ -1,233 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strings"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
if runtime.GOOS != "windows" {
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
}
var variants []string
for v := range getAvailableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func getAvailableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// serversForGpu returns a list of compatible servers give the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := getAvailableServers()
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := getAvailableServers()
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}

View file

@ -24,9 +24,11 @@ import (
"golang.org/x/sync/semaphore" "golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/build"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/runners"
) )
type LlamaServer interface { type LlamaServer interface {
@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
gpus = gpu.GetCPUInfo() gpus = gpu.GetCPUInfo()
} }
if len(gpus) == 1 && gpus[0].Library == "cpu" { if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu() cpuRunner = runners.ServerForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
opts.NumGPU = 0 opts.NumGPU = 0
case gpus[0].Library != "metal" && estimate.Layers == 0: case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit // Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu() cpuRunner = runners.ServerForCpu()
gpus = gpu.GetCPUInfo() gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers opts.NumGPU = estimate.Layers
@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
} }
availableServers := getAvailableServers() rDir, err := runners.Refresh(build.EmbedFS)
if err != nil {
return nil, err
}
availableServers := runners.GetAvailableServers(rDir)
if len(availableServers) == 0 { if len(availableServers) == 0 {
if runtime.GOOS != "windows" { return nil, finalErr
slog.Warn("llama server binary disappeared, reinitializing payloads")
err = Init()
if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err
}
availableServers = getAvailableServers()
} else {
return nil, finalErr
}
} }
var servers []string var servers []string
if cpuRunner != "" { if cpuRunner != "" {
servers = []string{cpuRunner} servers = []string{cpuRunner}
} else { } else {
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
} }
demandLib := envconfig.LLMLibrary() demandLib := envconfig.LLMLibrary()
if demandLib != "" { if demandLib != "" {
@ -274,7 +271,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--tensor-split", estimate.TensorSplit) params = append(params, "--tensor-split", estimate.TensorSplit)
} }
for i := range len(servers) { for i := range servers {
dir := availableServers[servers[i]] dir := availableServers[servers[i]]
if dir == "" { if dir == "" {
// Shouldn't happen // Shouldn't happen
@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
_, err := os.Stat(server) _, err := os.Stat(server)
if errors.Is(err, os.ErrNotExist) { if errors.Is(err, os.ErrNotExist) {
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err) slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
err = Init() _, err = runners.Refresh(build.EmbedFS)
if err != nil { if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err) slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err return nil, err

384
runners/common.go Normal file
View file

@ -0,0 +1,384 @@
package runners
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
"syscall"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
)
const (
binGlob = "*/*/*/*"
)
var (
lock sync.Mutex
runnersDir = ""
)
// Return the location where runners are stored
// If runners are payloads, this will either extract them
// or refresh them if any have disappeared due to tmp cleaners
func Refresh(payloadFS fs.FS) (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
// Wire up extra logging on our first load
if runnersDir == "" {
defer func() {
var runners []string
for v := range GetAvailableServers(runnersDir) {
runners = append(runners, v)
}
slog.Info("Dynamic LLM libraries", "runners", runners)
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
}()
}
if hasPayloads(payloadFS) {
if runnersDir == "" {
runnersDir, err = extractRunners(payloadFS)
} else {
err = refreshRunners(payloadFS, runnersDir)
}
} else if runnersDir == "" {
runnersDir, err = locateRunners()
}
return runnersDir, err
}
func Cleanup(payloadFS fs.FS) {
lock.Lock()
defer lock.Unlock()
if hasPayloads(payloadFS) && runnersDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
func locateRunners() (string, error) {
exe, err := os.Executable()
if err != nil {
return "", err
}
cwd, err := os.Getwd()
if err != nil {
return "", err
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
return candidate, nil
}
}
return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
}
// Return true if we're carrying nested payloads for the runners
func hasPayloads(payloadFS fs.FS) bool {
files, err := fs.Glob(payloadFS, binGlob)
if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
return false
}
return true
}
func extractRunners(payloadFS fs.FS) (string, error) {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
slog.Warn("failed to write pid file", "file", n, "error", err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
rDir := filepath.Join(tmpDir, "runners")
slog.Info("extracting embedded files", "dir", rDir)
return rDir, refreshRunners(payloadFS, rDir)
}
func refreshRunners(payloadFS fs.FS, rDir string) error {
// extract or refresh server libraries
err := extractFiles(payloadFS, rDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
return nil
}
// extract extracts the embedded files to the target directory
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
files, err := fs.Glob(payloadFS, glob)
if err != nil || len(files) == 0 {
// Should not happen
return fmt.Errorf("extractFiles called without payload present")
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// $OS/$GOARCH/$RUNNER/$FILE
for _, file := range files {
filename := file
runner := filepath.Base(filepath.Dir(filename))
slog.Debug("extracting", "runner", runner, "payload", filename)
g.Go(func() error {
srcf, err := payloadFS.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
runnerDir := filepath.Join(targetDir, runner)
if err := os.MkdirAll(runnerDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(runnerDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
slog.Error("failed to extract files", "error", err)
// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
err := os.RemoveAll(targetDir)
if err != nil {
slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
}
return err
}
return nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir = os.TempDir()
}
matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
// directory names are the name of the runner and may contain an optional
// variant prefixed with '_' as the separator. For example, "cuda_v11" and
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
// lowest common denominator
func GetAvailableServers(payloadsDir string) map[string]string {
if payloadsDir == "" {
slog.Error("empty runner dir")
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// ServersForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. Assumes Refresh() has been called
// TODO - switch to metadata based mapping
func ServersForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := GetAvailableServers(runnersDir)
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func ServerForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := GetAvailableServers(runnersDir)
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}
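Putting the new package together, a caller extracts (or locates) the runners once and then picks the best match for the detected hardware. A minimal usage sketch, assuming `gpu.GetGPUInfo()` is the detection entry point used elsewhere in the tree:

```go
package main

import (
	"fmt"
	"log"
	"path/filepath"

	"github.com/ollama/ollama/build"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/runners"
)

func main() {
	// Extract or refresh the embedded runner payloads.
	rDir, err := runners.Refresh(build.EmbedFS)
	if err != nil {
		log.Fatal(err)
	}
	available := runners.GetAvailableServers(rDir)

	// Prefer a GPU-specific runner, falling back to the best CPU variant.
	candidates := []string{runners.ServerForCpu()}
	if gpus := gpu.GetGPUInfo(); len(gpus) > 0 && gpus[0].Library != "cpu" {
		candidates = runners.ServersForGpu(gpus[0])
	}
	for _, name := range candidates {
		if dir, ok := available[name]; ok {
			fmt.Println("would launch", filepath.Join(dir, "ollama_llama_server"))
			break
		}
	}
}
```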

50
runners/runners_test.go Normal file
View file

@ -0,0 +1,50 @@
package runners
import (
"log/slog"
"os"
"path"
"runtime"
"strings"
"testing"
"testing/fstest"
)
func TestRefreshRunners(t *testing.T) {
slog.SetLogLoggerLevel(slog.LevelDebug)
payloadFS := fstest.MapFS{
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
}
tmpDir, err := os.MkdirTemp("", "testing")
if err != nil {
t.Fatalf("failed to make tmp dir %s", err)
}
t.Setenv("OLLAMA_TMPDIR", tmpDir)
rDir, err := Refresh(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
// spot check results
servers := GetAvailableServers(rDir)
if len(servers) < 1 {
t.Fatalf("expected at least 1 server")
}
// Refresh contents
rDir, err = extractRunners(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
cleanupTmpDirs()
Cleanup(payloadFS)
}

View file

@ -2,8 +2,7 @@
set -e set -e
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
mkdir -p dist mkdir -p dist

View file

@ -2,76 +2,34 @@
set -eu set -eu
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# We use 2 different image repositories to handle combining architecture images into multiarch manifest
# (The ROCm image is x86 only and is not a multiarch manifest)
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
# Set PUSH to a non-empty string to trigger push instead of load # Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""} PUSH=${PUSH:-""}
# In CI mode, we break things down
OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
if [ -z "${PUSH}" ] ; then if [ -z "${PUSH}" ] ; then
echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push"
LOAD_OR_PUSH="--load" LOAD_OR_PUSH="--load"
else else
echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}" echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
LOAD_OR_PUSH="--push" LOAD_OR_PUSH="--push"
fi fi
if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then docker buildx build \
for TARGETARCH in ${BUILD_ARCH}; do ${LOAD_OR_PUSH} \
docker build \ --platform=${PLATFORM} \
${LOAD_OR_PUSH} \ ${OLLAMA_COMMON_BUILD_ARGS} \
--platform=linux/${TARGETARCH} \ -f Dockerfile \
--build-arg=VERSION \ -t ${FINAL_IMAGE_REPO}:$VERSION \
--build-arg=GOFLAGS \ .
-f Dockerfile \
-t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
.
done
if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then if echo $PLATFORM | grep "amd64" > /dev/null; then
docker build \ docker buildx build \
${LOAD_OR_PUSH} \ ${LOAD_OR_PUSH} \
--platform=linux/amd64 \ --platform=linux/amd64 \
--build-arg=VERSION \ ${OLLAMA_COMMON_BUILD_ARGS} \
--build-arg=GOFLAGS \ --target runtime-rocm \
--target runtime-rocm \ -f Dockerfile \
-f Dockerfile \ -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
-t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \ .
. fi
fi
fi
if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
if [ -n "${PUSH}" ]; then
docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
${RELEASE_IMAGE_REPO}:$VERSION-arm64
docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
# For symmetry, tag/push the rocm image
if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
echo "Tagging and pushing rocm image"
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
fi
else
echo "Skipping manifest generation when not pushing images are available locally as "
echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
fi
fi

View file

@ -1,37 +1,29 @@
#!/bin/sh #!/bin/sh
#
# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
#
# docker context create amd64 --docker host=ssh://mybuildhost
# docker buildx create --name mybuilder amd64 --platform linux/amd64
# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
# docker buildx use mybuilder
set -eu set -eu
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
GZIP=$(which pigz 2>/dev/null || echo "gzip")
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
mkdir -p dist mkdir -p dist
for TARGETARCH in ${BUILD_ARCH}; do docker buildx build \
docker build \ --output type=local,dest=./dist/ \
--platform=linux/$TARGETARCH \ --platform=${PLATFORM} \
--build-arg=GOFLAGS \ ${OLLAMA_COMMON_BUILD_ARGS} \
--build-arg=CGO_CFLAGS \ --target dist \
--build-arg=OLLAMA_CUSTOM_CPU_DEFS \
--build-arg=AMDGPU_TARGETS \
--target build-$TARGETARCH \
-f Dockerfile \ -f Dockerfile \
-t builder:$TARGETARCH \
. .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
rm -rf ./dist/linux-$TARGETARCH # buildx behavior changes for single vs. multiplatform
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist if echo $PLATFORM | grep "," > /dev/null ; then
if echo ${TARGETARCH} | grep "amd64" > /dev/null; then mv -f ./dist/linux_*64/ollama* ./dist/
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist rmdir ./dist/linux_*64
fi fi
docker rm builder-$TARGETARCH
echo "Compressing final linux bundle..."
rm -f ./dist/ollama-linux-$TARGETARCH.tgz
(cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
if [ -d dist/linux-$TARGETARCH-rocm ]; then
(cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
fi
done

14
scripts/env.sh Normal file
View file

@ -0,0 +1,14 @@
# Common environment setup across build*.sh scripts
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
echo "Building Ollama"
echo "VERSION=$VERSION"
echo "PLATFORM=$PLATFORM"

View file

@ -26,11 +26,13 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/build"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai" "github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/parser"
"github.com/ollama/ollama/runners"
"github.com/ollama/ollama/template" "github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
@ -117,6 +119,32 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return return
} }
// expire the runner
if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
model, err := GetModel(req.Model)
if err != nil {
switch {
case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
case err.Error() == "invalid model name":
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
s.sched.expireRunner(model)
c.JSON(http.StatusOK, api.GenerateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Response: "",
Done: true,
DoneReason: "unload",
})
return
}
if req.Format != "" && req.Format != "json" { if req.Format != "" && req.Format != "json" {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""}) c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
return return
@ -1190,12 +1218,12 @@ func Serve(ln net.Listener) error {
srvr.Close() srvr.Close()
schedDone() schedDone()
sched.unloadAllRunners() sched.unloadAllRunners()
gpu.Cleanup() runners.Cleanup(build.EmbedFS)
done() done()
}() }()
if err := llm.Init(); err != nil { if _, err := runners.Refresh(build.EmbedFS); err != nil {
return fmt.Errorf("unable to initialize llm library %w", err) return fmt.Errorf("unable to initialize llm runners %w", err)
} }
s.sched.Run(schedCtx) s.sched.Run(schedCtx)
@ -1322,6 +1350,32 @@ func (s *Server) ChatHandler(c *gin.Context) {
return return
} }
// expire the runner
if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
model, err := GetModel(req.Model)
if err != nil {
switch {
case os.IsNotExist(err):
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
case err.Error() == "invalid model name":
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return
}
s.sched.expireRunner(model)
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant"},
Done: true,
DoneReason: "unload",
})
return
}
caps := []Capability{CapabilityCompletion} caps := []Capability{CapabilityCompletion}
if len(req.Tools) > 0 { if len(req.Tools) > 0 {
caps = append(caps, CapabilityTools) caps = append(caps, CapabilityTools)
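With this change, a generate or chat request that carries no prompt/messages and `keep_alive: 0` unloads the model immediately instead of being treated as a normal completion. A hedged client-side sketch using the Go `api` package (the model name and client setup are illustrative):

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// An empty prompt with keep_alive set to 0 asks the server to expire the runner.
	req := &api.GenerateRequest{
		Model:     "llama3.1",
		KeepAlive: &api.Duration{Duration: 0},
	}
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		log.Printf("done=%v reason=%s", resp.Done, resp.DoneReason)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```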

View file

@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
slog.Debug("runner expired event received", "modelPath", runner.modelPath) slog.Debug("runner expired event received", "modelPath", runner.modelPath)
runner.refMu.Lock() runner.refMu.Lock()
if runner.refCount > 0 { if runner.refCount > 0 {
// Shouldn't happen, but safeguard to ensure no leaked runners
slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
go func(runner *runnerRef) { go func(runner *runnerRef) {
// We can't unload yet, but want to as soon as the current request completes // We can't unload yet, but want to as soon as the current request completes
@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() {
} }
} }
func (s *Scheduler) expireRunner(model *Model) {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
runner, ok := s.loaded[model.ModelPath]
if ok {
runner.refMu.Lock()
runner.expiresAt = time.Now()
if runner.expireTimer != nil {
runner.expireTimer.Stop()
runner.expireTimer = nil
}
runner.sessionDuration = 0
if runner.refCount <= 0 {
s.expiredCh <- runner
}
runner.refMu.Unlock()
}
}
// If other runners are loaded, make sure the pending request will fit in system memory // If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded // If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef { func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {

View file

@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) {
b.ctxDone() b.ctxDone()
} }
func TestExpireRunner(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
defer done()
s := InitScheduler(ctx)
req := &LlmRequest{
ctx: ctx,
model: &Model{ModelPath: "foo"},
opts: api.DefaultOptions(),
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
}
var ggml *llm.GGML
gpus := gpu.GpuInfoList{}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return server, nil
}
s.load(req, ggml, gpus, 0)
select {
case err := <-req.errCh:
if err != nil {
t.Fatalf("expected no errors when loading, got '%s'", err.Error())
}
case resp := <-req.successCh:
s.loadedMu.Lock()
if resp.refCount != uint(1) || len(s.loaded) != 1 {
t.Fatalf("expected a model to be loaded")
}
s.loadedMu.Unlock()
}
s.expireRunner(&Model{ModelPath: "foo"})
s.finishedReqCh <- req
s.processCompleted(ctx)
s.loadedMu.Lock()
if len(s.loaded) != 0 {
t.Fatalf("expected model to be unloaded")
}
s.loadedMu.Unlock()
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count // TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) { func TestPrematureExpired(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)