Optimize container images for startup (#6547)

* Optimize container images for startup

This change adjusts how runner payloads are handled so that container
builds can keep them extracted in the filesystem. This makes it easier to
optimize the cpu/cuda and cpu/rocm images for size, and should result in
faster startup times for container images. (A short sketch of the resulting
runner-resolution flow follows the change list below.)

* Refactor payload logic and add buildx support for faster builds

* Move payloads around

* Address review comments

* Converge to buildx based helper scripts

* Use docker buildx action for release
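
In practice the payload handling now works in two modes: release binaries still embed gzipped runner payloads and extract them into a temporary directory at startup, while the optimized container images ship the runners pre-extracted on disk so the server only has to locate them next to the binary. The following is a minimal, hypothetical Go sketch of that resolution order (the resolveRunners helper and its arguments are illustrative, not code from this change; the real logic lives in runners/common.go below):

package main

import (
    "fmt"
    "io/fs"
    "os"
    "path/filepath"
    "strings"
)

// resolveRunners picks a directory holding runner binaries. payloadFS stands in
// for the embedded payload filesystem; searchRoots are the on-disk locations
// probed when no payloads are embedded (the container-image case).
func resolveRunners(payloadFS fs.FS, searchRoots []string) (string, error) {
    matches, err := fs.Glob(payloadFS, "*/*/*/*")
    hasPayloads := err == nil && len(matches) > 0 &&
        !(len(matches) == 1 && strings.Contains(matches[0], "placeholder"))
    if hasPayloads {
        // Payload builds: create the tmp dir the gzipped payloads would be
        // decompressed into (the actual extraction is omitted in this sketch).
        tmpDir, err := os.MkdirTemp("", "ollama")
        if err != nil {
            return "", err
        }
        return filepath.Join(tmpDir, "runners"), nil
    }
    // Container-optimized builds: runners are already extracted on disk.
    for _, root := range searchRoots {
        candidate := filepath.Join(root, "lib", "ollama", "runners")
        if _, err := os.Stat(candidate); err == nil {
            return candidate, nil
        }
    }
    return "", fmt.Errorf("no runners found under %v", searchRoots)
}

func main() {
    exe, _ := os.Executable()
    dir, err := resolveRunners(os.DirFS("."), []string{filepath.Dir(exe)})
    fmt.Println(dir, err)
}

Either way, the resulting directory is what gets scanned for ollama_* runner binaries, which is why the container images can skip the extraction step entirely.
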
Daniel Hiltgen, 2024-09-12 12:10:30 -07:00 (committed by GitHub)
commit cd5c8f6471 (parent fef257c5c5)
32 changed files with 861 additions and 689 deletions

@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .cache
 test_data
+llm/build
+llama/build

@@ -102,8 +102,8 @@ jobs:
 with:
 name: generate-windows-cpu
 path: |
-llm/build/**/bin/*
-llm/build/**/*.a
+build/**/*
+build/**/*.a
 dist/windows-amd64/**
 # ROCm generation step
@@ -176,7 +176,7 @@ jobs:
 with:
 name: generate-windows-rocm
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:
@@ -265,7 +265,7 @@ jobs:
 with:
 name: generate-windows-cuda-${{ matrix.cuda.version }}
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:
@@ -338,7 +338,7 @@ jobs:
 - uses: actions/download-artifact@v4
 with:
 name: generate-windows-rocm
-- run: dir llm/build
+- run: dir build
 - run: |
 $gopath=(get-command go).source | split-path -parent
 & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -359,9 +359,7 @@ jobs:
 environment: release
 runs-on: linux
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: amd64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:
@@ -369,14 +367,8 @@ jobs:
 - name: Set Version
 shell: bash
 run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-amd64
@@ -390,9 +382,7 @@ jobs:
 environment: release
 runs-on: linux-arm64
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: arm64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:
@@ -421,14 +411,8 @@ jobs:
 sudo usermod -aG docker $USER
 sudo apt-get install acl
 sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-arm64
@@ -436,6 +420,181 @@ jobs:
 dist/*linux*
 !dist/*-cov
# Container image build
build-linux:
environment: release
strategy:
matrix:
runner:
- linux
- linux-arm64
runs-on: ${{ matrix.runner }}
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: 'Install Docker'
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
run: |
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
platforms: linux/${{ env.ARCH }}
build-args: |
GOFLAGS
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
with:
name: digests-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
environment: release
runs-on: linux
needs:
- build-linux
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download digests
uses: actions/download-artifact@v4
with:
path: /tmp/digests
pattern: digests-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-linux-rocm:
environment: release
runs-on: linux
env:
FINAL_IMAGE_REPO: ollama/ollama
ARCH: amd64
PLATFORM_PAIR: linux-amd64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
push: true
 # Aggregate all the assets and ship a release
 release:
 needs:
@@ -448,8 +607,6 @@ jobs:
 permissions:
 contents: write
 env:
-OLLAMA_SKIP_IMAGE_BUILD: '1'
-PUSH: '1'
 GH_TOKEN: ${{ github.token }}
 steps:
 - uses: actions/checkout@v4
@@ -458,12 +615,6 @@ jobs:
 run: |
 echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
 echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-- run: ./scripts/build_docker.sh
 - name: Retrieve built artifact
 uses: actions/download-artifact@v4
 with:

@@ -81,12 +81,6 @@ jobs:
 if: ${{ ! startsWith(matrix.os, 'windows-') }}
 name: 'Unix Go Generate'
 - run: go build .
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-path: |
-llm/build/**/bin/*
-llm/build/**/*.a
 generate-cuda:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: cuda-${{ matrix.cuda-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
 generate-rocm:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: rocm-${{ matrix.rocm-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
 # ROCm generation step
 generate-windows-rocm:
@@ -189,7 +171,6 @@ jobs:
 name: go generate
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
 # CUDA generation step
 generate-windows-cuda:
@@ -231,7 +212,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
 lint:
 strategy:
@@ -263,14 +243,6 @@ jobs:
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
 - uses: golangci/golangci-lint-action@v6
 with:
 args: --timeout 8m0s -v
@@ -301,23 +273,10 @@ jobs:
 cache: true
 - run: |
 case ${{ matrix.arch }} in
-amd64) echo ARCH=x86_64 ;;
+amd64) echo ARCH=amd64 ;;
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
-shell: bash
 - run: go generate ./...
 - run: go build
 - run: go test -v ./...
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-binaries
-path: ollama

.gitignore

@@ -12,4 +12,7 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*

@@ -47,7 +47,7 @@ RUN --mount=type=cache,target=/root/.ccache \
 OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
 bash gen_linux.sh
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -63,7 +63,7 @@ RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
 CUDA_VARIANT="_v11" \
 bash gen_linux.sh
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -143,64 +143,103 @@ RUN --mount=type=cache,target=/root/.ccache \
 OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
-# Intermediate stage used for ./scripts/build_linux.sh
+# Intermediate stages used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED=1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
 go build -trimpath -o dist/linux-amd64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-rocm && \
+tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz
-# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED=1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+go build -trimpath -o dist/linux-arm64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+FROM --platform=linux/amd64 scratch AS dist-amd64
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM --platform=linux/arm64 scratch AS dist-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM dist-$TARGETARCH as dist
+# Optimized container images do not cary nested payloads
+FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+go build -trimpath -o dist/linux-amd64/bin/ollama .
+FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
 go build -trimpath -o dist/linux-arm64/bin/ollama .
-# Strip out ROCm dependencies to keep the primary image lean
-FROM --platform=linux/amd64 ubuntu:22.04 AS amd64-libs-without-rocm
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
-RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*
+# Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-COPY --from=amd64-libs-without-rocm /scratch/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+apt-get install -y ca-certificates && \
 apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+apt-get install -y ca-certificates && \
 apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
+# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM rocm/dev-centos-7:${ROCM_VERSION}-complete AS runtime-rocm
+FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-RUN update-pciids
+# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+# across releases
-RUN ln -s /opt/rocm/lib /lib/ollama
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+RUN apt-get update && \
+apt-get install -y ca-certificates && \
+apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
 ENV OLLAMA_HOST=0.0.0.0

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

@@ -0,0 +1,8 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/amd64/*
var EmbedFS embed.FS

@@ -0,0 +1,8 @@
package build
import "embed"
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/arm64/*
var EmbedFS embed.FS

build/embed_linux.go

@@ -0,0 +1,6 @@
package build
import "embed"
//go:embed linux/*
var EmbedFS embed.FS

build/embed_unused.go

@@ -0,0 +1,8 @@
//go:build !linux && !darwin
package build
import "embed"
// unused on windows
var EmbedFS embed.FS

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

@@ -179,53 +179,6 @@ var (
 HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )
-func RunnersDir() (p string) {
-if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-return p
-}
-if runtime.GOOS != "windows" {
-return
-}
-defer func() {
-if p == "" {
-slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
-}
-}()
-// On Windows we do not carry the payloads inside the main executable
-exe, err := os.Executable()
-if err != nil {
-return
-}
-cwd, err := os.Getwd()
-if err != nil {
-return
-}
-var paths []string
-for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
-paths = append(paths,
-root,
-filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-)
-}
-// Try a few variations to improve developer experience when building from source in the local tree
-for _, path := range paths {
-candidate := filepath.Join(path, "lib", "ollama", "runners")
-if _, err := os.Stat(candidate); err == nil {
-p = candidate
-break
-}
-}
-return p
-}
 func Uint(key string, defaultValue uint) func() uint {
 return func() uint {
 if s := Var(key); s != "" {
@@ -290,7 +243,6 @@ func AsMap() map[string]EnvVar {
 "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
 "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},

@@ -1,148 +0,0 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/ollama/ollama/envconfig"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir()
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
}
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
} else {
err = os.MkdirAll(tmpDir, 0o755)
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
}
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir()
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info("updating", "PATH", newPath)
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}

@@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
 localAppData := os.Getenv("LOCALAPPDATA")
 cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 }
-tmpDir, _ := PayloadsDir()
-if tmpDir != "" {
-// TODO - add "payloads" for subprocess
-cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+libDir := LibraryDir()
+if libDir != "" {
+cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
 }
 cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

@@ -31,6 +31,7 @@ init_vars() {
 NO_WHOLE_ARCHIVE=""
 GCC_ARCH="-arch ${ARCH}"
 DIST_BASE=../../dist/darwin-${GOARCH}/
+PAYLOAD_BASE=../../build/darwin/${GOARCH}
 ;;
 "Linux")
 LIB_EXT="so"
@@ -40,6+41,7 @@ init_vars() {
 # Cross compiling not supported on linux - Use docker
 GCC_ARCH=""
 DIST_BASE=../../dist/linux-${GOARCH}/
+PAYLOAD_BASE=../../build/linux/${GOARCH}
 ;;
 *)
 ;;
@@ -47,7 +49,8 @@ init_vars() {
 if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
 CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
 fi
-GZIP=$(which pigz 2>/dev/null || echo "gzip")
+GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
+RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }
 git_module_setup() {
@@ -91,17 +94,34 @@ build() {
 rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }
-compress() {
+dist() {
-echo "Compressing payloads to reduce overall binary size..."
+[ -z "${RUNNER}" ] && exit 1
-rm -rf ${BUILD_DIR}/bin/*.gz
+mkdir -p ${RUNNER_BASE}/${RUNNER}/
 for f in ${BUILD_DIR}/bin/* ; do
-${GZIP} -n --best -f ${f} &
+cp ${f} ${RUNNER_BASE}/${RUNNER}/
+done
+# check for lib directory
+if [ -d ${BUILD_DIR}/lib ]; then
+for f in ${BUILD_DIR}/lib/* ; do
+cp ${f} ${RUNNER_BASE}/${RUNNER}/
+done
+fi
+}
+# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
+compress() {
+[ -z "${RUNNER}" ] && exit 1
+echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
+rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
+mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
+for f in ${BUILD_DIR}/bin/* ; do
+${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
 compress_pids+=" $!"
 done
 # check for lib directory
 if [ -d ${BUILD_DIR}/lib ]; then
 for f in ${BUILD_DIR}/lib/* ; do
-${GZIP} -n --best -f ${f} &
+${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
 compress_pids+=" $!"
 done
 fi

@@ -39,7 +39,8 @@ case "${GOARCH}" in
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="../build/darwin/${ARCH}/cpu"
+RUNNER=cpu
+BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
 echo "Building LCD CPU"
 build
 sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -51,7 +52,8 @@ case "${GOARCH}" in
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+RUNNER=cpu_avx
+BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
 echo "Building AVX CPU"
 build
 sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -63,7 +65,8 @@ case "${GOARCH}" in
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+RUNNER=cpu_avx2
+BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
 echo "Building AVX2 CPU"
 EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
 build
@@ -84,7 +87,8 @@ case "${GOARCH}" in
 if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
 init_vars
 CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-BUILD_DIR="../build/darwin/${ARCH}/metal"
+RUNNER="metal"
+BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
 EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
 build
 sign ${BUILD_DIR}/bin/ollama_llama_server

@@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
 init_vars
 echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
 CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-BUILD_DIR="../build/linux/${ARCH}/cpu"
+RUNNER="cpu"
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 echo "Building custom CPU"
 build
 install
+dist
 compress
 else
 # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="../build/linux/${ARCH}/cpu"
+RUNNER=cpu
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 echo "Building LCD CPU"
 build
 install
+dist
 compress
 fi
@@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+RUNNER=cpu_avx
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 echo "Building AVX CPU"
 build
 install
+dist
 compress
 fi
@@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
 #
 init_vars
 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+RUNNER=cpu_avx2
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 echo "Building AVX2 CPU"
 build
 install
+dist
 compress
 fi
 fi
@@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
 fi
 export CUDAFLAGS="-t8"
 CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+RUNNER=cuda${CUDA_VARIANT}
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
 CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
 build
 install
+dist
 echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
 mkdir -p "${CUDA_DIST_DIR}"
 for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
 source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
 CC=icx
 CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-BUILD_DIR="../build/linux/${ARCH}/oneapi"
+RUNNER=oneapi
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
 export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
 DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
 cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
 cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
 install
+dist
 compress
 fi
@@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
 CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
 echo "Building custom ROCM GPU"
 fi
-BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+RUNNER=rocm${ROCM_VARIANT}
+BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
 # ROCm dependencies are too large to fit into a unified bundle
 ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
 # TODO figure out how to disable runpath (rpath)
@@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
 # copy the ROCM dependencies
 mkdir -p "${ROCM_DIST_DIR}"
-for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
 cp -a "${dep}"* "${ROCM_DIST_DIR}"
+if [ $(readlink -f "${dep}") != "${dep}" ] ; then
+cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
+fi
 done
 install
+dist
 compress
 fi
 cleanup
 wait_for_compress
-echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"

@@ -1,11 +1,7 @@
 package llm
 import (
-"embed"
 "syscall"
 )
-//go:embed build/darwin/arm64/*/bin/*
-var libEmbed embed.FS
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@@ -1,11 +0,0 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@@ -1,11 +1,7 @@
 package llm
 import (
-"embed"
 "syscall"
 )
-//go:embed build/linux/*/*/bin/*
-var libEmbed embed.FS
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@@ -1,13 +1,9 @@
 package llm
 import (
-"embed"
 "syscall"
 )
-// unused on windows
-var libEmbed embed.FS
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{

@@ -1,233 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strings"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
if runtime.GOOS != "windows" {
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
}
var variants []string
for v := range getAvailableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func getAvailableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// serversForGpu returns a list of compatible servers give the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := getAvailableServers()
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := getAvailableServers()
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}

@@ -24,9 +24,11 @@ import (
 "golang.org/x/sync/semaphore"
 "github.com/ollama/ollama/api"
+"github.com/ollama/ollama/build"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/gpu"
+"github.com/ollama/ollama/runners"
 )
 type LlamaServer interface {
@@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 gpus = gpu.GetCPUInfo()
 }
 if len(gpus) == 1 && gpus[0].Library == "cpu" {
-cpuRunner = serverForCpu()
+cpuRunner = runners.ServerForCpu()
 estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 } else {
 estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 opts.NumGPU = 0
 case gpus[0].Library != "metal" && estimate.Layers == 0:
 // Don't bother loading into the GPU if no layers can fit
-cpuRunner = serverForCpu()
+cpuRunner = runners.ServerForCpu()
 gpus = gpu.GetCPUInfo()
 case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 opts.NumGPU = estimate.Layers
@@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 }
-availableServers := getAvailableServers()
+rDir, err := runners.Refresh(build.EmbedFS)
+if err != nil {
+return nil, err
+}
+availableServers := runners.GetAvailableServers(rDir)
 if len(availableServers) == 0 {
-if runtime.GOOS != "windows" {
-slog.Warn("llama server binary disappeared, reinitializing payloads")
-err = Init()
-if err != nil {
-slog.Warn("failed to reinitialize payloads", "error", err)
-return nil, err
-}
-availableServers = getAvailableServers()
-} else {
-return nil, finalErr
-}
+return nil, finalErr
 }
 var servers []string
 if cpuRunner != "" {
 servers = []string{cpuRunner}
 } else {
-servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 }
 demandLib := envconfig.LLMLibrary()
 if demandLib != "" {
@@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 _, err := os.Stat(server)
 if errors.Is(err, os.ErrNotExist) {
 slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-err = Init()
+_, err = runners.Refresh(build.EmbedFS)
 if err != nil {
 slog.Warn("failed to reinitialize payloads", "error", err)
 return nil, err

runners/common.go

@@ -0,0 +1,384 @@
package runners
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
"syscall"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
)
const (
binGlob = "*/*/*/*"
)
var (
lock sync.Mutex
runnersDir = ""
)
// Return the location where runners are stored
// If runners are payloads, this will either extract them
// or refresh them if any have disappeared due to tmp cleaners
func Refresh(payloadFS fs.FS) (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
// Wire up extra logging on our first load
if runnersDir == "" {
defer func() {
var runners []string
for v := range GetAvailableServers(runnersDir) {
runners = append(runners, v)
}
slog.Info("Dynamic LLM libraries", "runners", runners)
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
}()
}
if hasPayloads(payloadFS) {
if runnersDir == "" {
runnersDir, err = extractRunners(payloadFS)
} else {
err = refreshRunners(payloadFS, runnersDir)
}
} else if runnersDir == "" {
runnersDir, err = locateRunners()
}
return runnersDir, err
}
func Cleanup(payloadFS fs.FS) {
lock.Lock()
defer lock.Unlock()
if hasPayloads(payloadFS) && runnersDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
func locateRunners() (string, error) {
exe, err := os.Executable()
if err != nil {
return "", err
}
cwd, err := os.Getwd()
if err != nil {
return "", err
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
return candidate, nil
}
}
return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
}
// Return true if we're carying nested payloads for the runners
func hasPayloads(payloadFS fs.FS) bool {
files, err := fs.Glob(payloadFS, binGlob)
if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
return false
}
return true
}
func extractRunners(payloadFS fs.FS) (string, error) {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
slog.Warn("failed to write pid file", "file", n, "error", err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
rDir := filepath.Join(tmpDir, "runners")
slog.Info("extracting embedded files", "dir", rDir)
return rDir, refreshRunners(payloadFS, rDir)
}
func refreshRunners(payloadFS fs.FS, rDir string) error {
// extract or refresh server libraries
err := extractFiles(payloadFS, rDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
return nil
}
// extractFiles extracts the embedded files to the target directory
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
files, err := fs.Glob(payloadFS, glob)
if err != nil || len(files) == 0 {
// Should not happen
return fmt.Errorf("extractFiles called without payload present")
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// $OS/$GOARCH/$RUNNER/$FILE
for _, file := range files {
filename := file
runner := filepath.Base(filepath.Dir(filename))
slog.Debug("extracting", "runner", runner, "payload", filename)
g.Go(func() error {
srcf, err := payloadFS.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
runnerDir := filepath.Join(targetDir, runner)
if err := os.MkdirAll(runnerDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(runnerDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
slog.Error("failed to extract files", "error", err)
// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
err := os.RemoveAll(targetDir)
if err != nil {
slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
}
return err
}
return nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir = os.TempDir()
}
matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
// directory names are the name of the runner and may contain an optional
// variant prefixed with '_' as the separator. For example, "cuda_v11" and
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
// lowest common denominator
func GetAvailableServers(payloadsDir string) map[string]string {
if payloadsDir == "" {
slog.Error("empty runner dir")
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// ServersForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. Assumes Refresh() has been called.
// TODO - switch to metadata based mapping
func ServersForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := GetAvailableServers(runnersDir)
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func ServerForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := GetAvailableServers(runnersDir)
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}
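Taken together, the package is driven by two calls: Refresh extracts (or re-extracts) the embedded payloads and returns the runners directory, and Cleanup removes the temporary tree on shutdown. The following is a minimal, hypothetical wiring sketch, not part of this change; it assumes payloads are embedded via build.EmbedFS, and the real integration is the routes.go hunk further down.

package main

import (
	"log/slog"

	"github.com/ollama/ollama/build"
	"github.com/ollama/ollama/runners"
)

func main() {
	// With embedded payloads this extracts to $OLLAMA_TMPDIR/ollama*/runners;
	// without them it falls back to locating runners next to the executable.
	dir, err := runners.Refresh(build.EmbedFS)
	if err != nil {
		slog.Error("no runners available", "error", err)
		return
	}
	defer runners.Cleanup(build.EmbedFS)

	slog.Info("runners ready", "dir", dir, "available", runners.GetAvailableServers(dir))
	slog.Info("preferred cpu runner", "runner", runners.ServerForCpu())
}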

runners/runners_test.go Normal file

@ -0,0 +1,50 @@
package runners
import (
"log/slog"
"os"
"path"
"runtime"
"strings"
"testing"
"testing/fstest"
)
func TestRefreshRunners(t *testing.T) {
slog.SetLogLoggerLevel(slog.LevelDebug)
payloadFS := fstest.MapFS{
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
}
tmpDir, err := os.MkdirTemp("", "testing")
if err != nil {
t.Fatalf("failed to make tmp dir %s", err)
}
t.Setenv("OLLAMA_TMPDIR", tmpDir)
rDir, err := Refresh(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
// spot check results
servers := GetAvailableServers(rDir)
if len(servers) < 1 {
t.Fatalf("expected at least 1 server")
}
// Refresh contents
rDir, err = extractRunners(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
cleanupTmpDirs()
Cleanup(payloadFS)
}
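To make the variant naming concrete: with runners named cpu, cpu_avx2, cuda_v11 and cuda_v12 extracted, a cuda/v12 GPU gets the exact library_variant match first, then the sorted same-library alternates, then the best matching CPU variant as a fallback. The sketch below is illustrative only (it is not part of the shipped tests and assumes the host CPU reports avx2):

package runners

import (
	"fmt"
	"path"
	"runtime"
	"testing/fstest"

	"github.com/ollama/ollama/gpu"
)

// ExampleServersForGpu shows the selection order for a hypothetical payload set.
func ExampleServersForGpu() {
	payloadFS := fstest.MapFS{
		path.Join(runtime.GOOS, runtime.GOARCH, "cpu", "ollama_llama_server"):      {Data: []byte("stub")},
		path.Join(runtime.GOOS, runtime.GOARCH, "cpu_avx2", "ollama_llama_server"): {Data: []byte("stub")},
		path.Join(runtime.GOOS, runtime.GOARCH, "cuda_v11", "ollama_llama_server"): {Data: []byte("stub")},
		path.Join(runtime.GOOS, runtime.GOARCH, "cuda_v12", "ollama_llama_server"): {Data: []byte("stub")},
	}
	if _, err := Refresh(payloadFS); err != nil {
		return
	}
	defer Cleanup(payloadFS)
	// Typically prints [cuda_v12 cuda_v11 cpu_avx2]: exact match first,
	// then same-library alternates (sorted), then the best CPU variant.
	fmt.Println(ServersForGpu(gpu.GpuInfo{Library: "cuda", Variant: "v12"}))
}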

View file

@ -2,8 +2,7 @@
set -e

. $(dirname $0)/env.sh

mkdir -p dist

View file

@ -2,76 +2,34 @@
set -eu

. $(dirname $0)/env.sh

# Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""}

if [ -z "${PUSH}" ] ; then
    echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push"
    LOAD_OR_PUSH="--load"
else
    echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
    LOAD_OR_PUSH="--push"
fi

docker buildx build \
    ${LOAD_OR_PUSH} \
    --platform=${PLATFORM} \
    ${OLLAMA_COMMON_BUILD_ARGS} \
    -f Dockerfile \
    -t ${FINAL_IMAGE_REPO}:$VERSION \
    .

if echo $PLATFORM | grep "amd64" > /dev/null; then
    docker buildx build \
        ${LOAD_OR_PUSH} \
        --platform=linux/amd64 \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target runtime-rocm \
        -f Dockerfile \
        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
        .
fi

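In practice the script is driven entirely through the environment overrides from env.sh: a plain ./scripts/build_docker.sh builds the image locally with --load, while PUSH=1 ./scripts/build_docker.sh pushes ${FINAL_IMAGE_REPO}:$VERSION (plus the x86-only -rocm tag). With the default docker driver, --load generally handles only a single platform, so local developer builds are typically narrowed, for example PLATFORM=linux/amd64 ./scripts/build_docker.sh.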
View file

@ -1,37 +1,29 @@
#!/bin/sh
#
# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
#
# docker context create amd64 --docker host=ssh://mybuildhost
# docker buildx create --name mybuilder amd64 --platform linux/amd64
# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
# docker buildx use mybuilder

set -eu

. $(dirname $0)/env.sh

mkdir -p dist

docker buildx build \
    --output type=local,dest=./dist/ \
    --platform=${PLATFORM} \
    ${OLLAMA_COMMON_BUILD_ARGS} \
    --target dist \
    -f Dockerfile \
    .

# buildx behavior changes for single vs. multiplatform
if echo $PLATFORM | grep "," > /dev/null ; then
    mv -f ./dist/linux_*64/ollama* ./dist/
    rmdir ./dist/linux_*64
fi

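Here the dist build stage is exported straight to ./dist/ via --output type=local instead of copying artifacts out of a container. For a single-architecture build, for example PLATFORM=linux/arm64 ./scripts/build_linux.sh, buildx writes the artifacts directly into ./dist/; a multi-platform build nests them in per-platform linux_*64 directories, which the final step flattens.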
scripts/env.sh Normal file

@ -0,0 +1,14 @@
# Common environment setup across build*.sh scripts
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
echo "Building Ollama"
echo "VERSION=$VERSION"
echo "PLATFORM=$PLATFORM"

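env.sh centralizes the version, GOFLAGS, platform, and repository settings so the build_*.sh helpers stay consistent, and each variable uses a ${VAR:-default} form so it can be overridden per invocation; for example, a developer might run DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh to publish a multiarch image under a personal organization.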
View file

@ -26,11 +26,13 @@ import (
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/build"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai" "github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser" "github.com/ollama/ollama/parser"
"github.com/ollama/ollama/runners"
"github.com/ollama/ollama/template" "github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model" "github.com/ollama/ollama/types/model"
@ -1216,12 +1218,12 @@ func Serve(ln net.Listener) error {
srvr.Close()
schedDone()
sched.unloadAllRunners()
runners.Cleanup(build.EmbedFS)
done()
}()
if _, err := runners.Refresh(build.EmbedFS); err != nil {
return fmt.Errorf("unable to initialize llm runners %w", err)
}
s.sched.Run(schedCtx)