Remove submodule and shift to Go server - 0.4.0 (#7157)

* Remove llama.cpp submodule and shift new build to top * CI: install msys and clang gcc on win Needed for deepseek to work properly on windows
2024-10-30 10:34:28 -07:00 · 2024-10-30 10:34:28 -07:00 · b754f5a6a3
commit b754f5a6a3
parent a805e5947e
40 changed files with 366 additions and 15260 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -3,9 +3,7 @@ ollama
 app
 macapp
 dist
 llm/llama.cpp
 .env
 .cache
 test_data
 llm/build
 llama/build
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,3 @@
 llm/ext_server/* linguist-vendored
 llama/**/*.cpp linguist-vendored
 llama/**/*.hpp linguist-vendored
 llama/**/*.h linguist-vendored
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -48,8 +48,8 @@ jobs:
        with:
          name: dist-darwin
          path: |
-            dist/*arwin*
+            dist/Ollama-darwin.zip
-            !dist/*-cov
+            dist/ollama-darwin
  # Windows builds take a long time to both install the dependencies and build, so parallelize
  # CPU generation step
@ -85,6 +85,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@ -92,19 +110,19 @@ jobs:
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-        name: go generate
+          make -j $cores
        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            build/**/*
            build/**/*.a
            llm/build/**/*.a
            dist/windows-amd64/**
  # ROCm generation step
@ -140,6 +158,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@ -158,31 +194,21 @@ jobs:
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-        name: go generate
+          make -j $cores
-      - name: 'gather rocm dependencies'
+        name: make
        run: |
          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          md "dist\deps\bin\rocblas\library"
          cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
          cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
          cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
            build/**/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
          name: windows-rocm-deps
          path: dist/deps/*
  # CUDA generation step
  generate-windows-cuda:
@ -224,6 +250,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@ -245,34 +289,23 @@ jobs:
      - name: 'Verify CUDA'
        run: nvcc -V
      - run: go get ./...
-      - name: go generate
+      - name: make
        run: |
          $gopath=(get-command go).source | split-path -parent
          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$cudabin;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-      - name: 'gather cuda dependencies'
+          make -j $cores
        run: |
          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
          md "dist\deps"
          cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
          cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            build/**/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
          name: windows-cuda-deps-${{ matrix.cuda.version }}
          path: dist/deps/*
  # windows arm64 generate, go build, and zip file (no installer)
  # Output of this build is aggregated into the final x86 build
@ -292,6 +325,30 @@ jobs:
          choco install -y --no-progress git gzip
          echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      # pacman is buggy on win arm64, so we avoid using it, but rely on the binary artifacts
      # we download the sfx (7zip bundle) which isn't fully set up, but the binaries we need to build work
      - name: Install msys2 x64
        run: |
          $url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
          write-host "Downloading MSYS2"
          Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
              '-y', '-oC:\'
              ) -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      # since pacman isn't reliable, we just download the tar file and extract directly
      - name: Downloading and extracting msys2 make tar file
        run: |
          $url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
          write-host "Downloading make"
          Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
          cd c:\msys64; tar -xf make.tar.zst
          rm c:\msys64\make.tar.zst
      - name: Verify Make works properly
        run: |
          echo $env:PATH
          make --version
      - name: Install Visual Studio 2022
        run: |
          $components = @(
@ -385,10 +442,9 @@ jobs:
      - run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          $env:PATH="$gopath;$gccpath;$env:PATH"
          $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
          echo $env:PATH
          $env:ARCH="arm64"
          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
@ -441,6 +497,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@ -455,15 +529,6 @@ jobs:
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-cuda-12
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps-11
      - uses: actions/download-artifact@v4
        with:
          name: windows-cuda-deps-12
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
@ -474,11 +539,12 @@ jobs:
      - run: dir build
      - run: |
          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_GENERATE="1"
          $env:ARCH="amd64"
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -21,9 +21,6 @@ jobs:
  changes:
    runs-on: ubuntu-latest
    outputs:
      GENERATE: ${{ steps.changes.outputs.GENERATE }}
      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
      RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
    steps:
      - uses: actions/checkout@v4
@ -39,53 +36,12 @@ jobs:
          }
          {
            echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
            echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
            echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
            echo RUNNERS=$(changed 'llama/**')
          } >>$GITHUB_OUTPUT
-  generate:
+  runners-linux-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
        arch: [amd64, arm64]
        exclude:
          - os: ubuntu-latest
            arch: arm64
          - os: windows-2019
            arch: arm64
    runs-on: ${{ matrix.os }}
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
          cd $env:GITHUB_WORKSPACE
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
          go generate -x ./...
        if: ${{ startsWith(matrix.os, 'windows-') }}
        name: 'Windows Go Generate'
      - run: go generate -x ./...
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
        name: 'Unix Go Generate'
      - run: go build .
  generate-cuda:
    needs: [changes]
    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
    strategy:
      matrix:
        cuda-version:
@ -95,8 +51,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl
          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4
@ -107,12 +61,11 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
-        env:
+          make -j $cores cuda_v11
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+  runners-linux-rocm:
  generate-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        rocm-version:
@ -122,8 +75,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl rocm-libs
          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4
@ -134,14 +85,13 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
-        env:
+          make -j $cores rocm
          OLLAMA_SKIP_CPU_GENERATE: '1'
  # ROCm generation step
-  generate-windows-rocm:
+  runners-windows-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
@ -160,24 +110,42 @@ jobs:
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - run: go get ./...
      - run: |
          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
-        name: go generate
+          write-host $env:HIP_PATH
-        env:
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          make -j $cores rocm
        name: make
  # CUDA generation step
-  generate-windows-cuda:
+  runners-windows-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
@ -201,21 +169,40 @@ jobs:
          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
      - name: 'Verify CUDA'
        run: nvcc -V
      - name: Install msys2
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - run: go get ./...
-      - name: go generate
+      - name: make
        run: |
          $gopath=(get-command go).source | split-path -parent
          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$cudabin;$env:PATH"
          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
          make -j $cores cuda_v11
        env:
          OLLAMA_SKIP_CPU_GENERATE: '1'
-  runners:
+  runners-cpu:
    needs: [changes]
    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
@ -239,20 +226,41 @@ jobs:
          go-version-file: go.mod
          cache: true
      - run: go get ./...
      - name: Install msys2
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
          write-host "Downloading msys2"
          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
          write-host "Installing msys2"
          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: verify tools
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          get-command gcc
          gcc --version
          get-command make
          make --version
      - name: 'Build Windows Go Runners'
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-          cd $env:GITHUB_WORKSPACE
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
-          make -C llama -j 4      
+          make -j 4      
      - name: 'Build Unix Go Runners'
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        run: make -C llama -j 4
+        run: make -j 4
      - run: go build .
  lint:
@ -302,9 +310,6 @@ jobs:
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
      OLLAMA_CPU_TARGET: 'static'
      OLLAMA_SKIP_CPU_GENERATE: '1'
      OLLAMA_SKIP_METAL_GENERATE: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@ -319,7 +324,6 @@ jobs:
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
@ -333,4 +337,4 @@ jobs:
          submodules: recursive
      - name: Verify patches carry all the changes
        run: |
-          cd llama && make apply-patches sync && git diff --compact-summary --exit-code .
+          make apply-patches sync && git diff --compact-summary --exit-code llama
--- a/.gitmodules
+++ b/.gitmodules
@ -1,4 +0,0 @@
 [submodule "llama.cpp"]
 	path = llm/llama.cpp
 	url = https://github.com/ggerganov/llama.cpp.git
 	shallow = true
--- a/284
+++ b/284
@ -6,168 +6,134 @@ ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
-# Copy the minimal context we need to run the generate scripts
+### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-FROM scratch AS llm-code
+#
-COPY .git .git
+# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
-COPY .gitmodules .gitmodules
+# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-COPY llm llm
+#
-
+### Then incremental builds will be much faster in this container
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
+#
-ARG CMAKE_VERSION
+# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
-COPY ./scripts/rh_linux_deps.sh /
+#
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
 ENV GOARCH=amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
    CUDA_VARIANT="_v11" \
    bash gen_linux.sh
 FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
 ENV GOARCH=amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
    CUDA_VARIANT="_v12" \
    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
    bash gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V11_ARCHITECTURES
 ENV GOARCH=arm64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
    CUDA_VARIANT="_v11" \
    bash gen_linux.sh
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG CUDA_V12_ARCHITECTURES
 ENV GOARCH=arm64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 \
    OLLAMA_SKIP_CPU_GENERATE=1 \
    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
    CUDA_VARIANT="_v12" \
    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
    bash gen_linux.sh
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV LIBRARY_PATH=/opt/amdgpu/lib64
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 ENV GOARCH=amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )
 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
+RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+    dnf clean all && \
-ARG OLLAMA_CUSTOM_CPU_DEFS
+    dnf install -y \
-ARG CGO_CFLAGS
+    zsh \
-ENV GOARCH=amd64
+    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 # TODO intel oneapi goes here...
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
+### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-RUN --mount=type=cache,target=/root/.ccache \
+# Note: this does not contain jetson variants
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+#
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
+# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
-RUN --mount=type=cache,target=/root/.ccache \
+# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
+#
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
+FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+    dnf config-manager --set-enabled appstream && \
-ARG OLLAMA_CUSTOM_CPU_DEFS
+    dnf clean all && \
-ARG CGO_CFLAGS
+    dnf install -y \
-ENV GOARCH=arm64
+    zsh \
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
-FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
+FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
 ARG OLLAMA_SKIP_CUDA_11_GENERATE
 ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
 ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
        make -C llama -j $(expr $(nproc) / 2 ) ; \
    else \
        make -C llama -j 5 ; \
    fi
 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
 ARG OLLAMA_SKIP_CUDA_11_GENERATE
 ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    make -C llama -j 8
 # Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
+FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ENV CGO_ENABLED=1
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV CGO_ENABLED 1
 ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama
 FROM --platform=linux/amd64 builder-amd64 AS build-amd64
 COPY . .
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 ARG OLLAMA_SKIP_ROCM_GENERATE
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-rocm && \
+RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz
+    cd dist/linux-$GOARCH-rocm && \
    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
    fi
-FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
+FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ENV CGO_ENABLED=1
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 ENV CGO_ENABLED 1
 ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama
 FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
@ -179,11 +145,11 @@ FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
 FROM --platform=linux/arm64 scratch AS dist-arm64
 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH as dist
+FROM dist-$TARGETARCH AS dist
 # Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 cpu-builder-amd64 AS container-build-amd64
+FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
@ -191,7 +157,7 @@ ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
-FROM --platform=linux/arm64 cpu-builder-arm64 AS container-build-arm64
+FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
@ -199,48 +165,52 @@ ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .
 # For amd64 container images, filter out cuda/rocm to minimize size
 FROM runners-amd64 AS runners-cuda-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
    ./dist/linux-amd64/lib/ollama/runners/rocm*
 FROM runners-amd64 AS runners-rocm-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
    ./dist/linux-amd64/lib/ollama/libcu*.so* \
    ./dist/linux-amd64/lib/ollama/runners/cuda*
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 # Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
 # across releases
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
 FROM runtime-$TARGETARCH
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
--- a/4
+++ b/4
@ -0,0 +1,4 @@
 GOALS := $(or $(MAKECMDGOALS),all)
 .PHONY: $(GOALS)
 $(GOALS):
 	$(MAKE) -C llama $@
--- a/docs/development.md
+++ b/docs/development.md
@ -1,183 +1,5 @@
 # Development
 > [!IMPORTANT]
 > The `llm` package that loads and runs models is being updated to use a new [Go runner](#transition-to-go-runner): this should only impact a small set of PRs however it does change how the project is built.
 Install required tools:
 - cmake version 3.24 or higher
 - go version 1.22 or higher
 - gcc version 11.4.0 or higher
 ### MacOS
 ```bash
 brew install go cmake gcc
 ```
 Optionally enable debugging and more verbose logging:
 ```bash
 # At build time
 export CGO_CFLAGS="-g"
 # At runtime
 export OLLAMA_DEBUG=1
 ```
 Get the required libraries and build the native LLM code:
 ```bash
 go generate ./...
 ```
 Then build ollama:
 ```bash
 go build .
 ```
 Now you can run `ollama`:
 ```bash
 ./ollama
 ```
 ### Linux
 #### Linux CUDA (NVIDIA)
 _Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
 Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.
 Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
 libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
 a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
 Then generate dependencies:
 ```
 go generate ./...
 ```
 Then build the binary:
 ```
 go build .
 ```
 #### Linux ROCm (AMD)
 _Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
 Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `ROCM_PATH` to the location of the ROCm
 install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
 CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
 the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
 ```
 go generate ./...
 ```
 Then build the binary:
 ```
 go build .
 ```
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
 #### Advanced CPU Settings
 By default, running `go generate ./...` will compile a few different variations
 of the LLM library based on common CPU families and vector math capabilities,
 including a lowest-common-denominator which should run on almost any 64 bit CPU
 somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
 load. If you would like to build a CPU-based build customized for your
 processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
 like to use. For example, to compile an optimized binary for an Intel i9-9880H,
 you might use:
 ```
 OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
 go build .
 ```
 #### Containerized Linux Build
 If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
 ### Windows
 Note: The Windows build for Ollama is still under development.
 First, install required tools:
 - MSVC toolchain - C/C++ and cmake as minimal requirements
 - Go version 1.22 or higher
 - MinGW (pick one variant) with GCC.
  - [MinGW-w64](https://www.mingw-w64.org/)
  - [MSYS2](https://www.msys2.org/)
 - The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
 Then, build the `ollama` binary:
 ```powershell
 $env:CGO_ENABLED="1"
 go generate ./...
 go build .
 ```
 #### Windows CUDA (NVIDIA)
 In addition to the common Windows development tools described above, install CUDA after installing MSVC.
 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
 #### Windows ROCm (AMD Radeon)
 In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
 - [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
 - [Strawberry Perl](https://strawberryperl.com/)
 Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
 #### Windows arm64
 The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want.  To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
 ```powershell
 import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
 Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
 ```
 You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
 Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment.  Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
 ```
 pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
 ```
 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
 ## Transition to Go runner
 The Ollama team is working on moving to a new Go based runner that loads and runs models in a subprocess to replace the previous code under `ext_server`. During this transition period, this new Go runner is "opt in" at build time, and requires using a different approach to build.
 After the transition to use the Go server exclusively, both `make` and `go generate` will build the Go runner.
 Install required tools:
 - go version 1.22 or higher
@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1
 Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
 ```bash
-make -C llama -j 5
+make -j 5
 ```
 Then build ollama:
@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
 ```
-make -C llama -j 5
+make -j 5
 ```
 Then build the binary:
@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
 ```
-make -C llama -j 5
+make -j 5
 ```
 Then build the binary:
@ -308,7 +130,7 @@ Then, build the `ollama` binary:
 ```powershell
 $env:CGO_ENABLED="1"
-make -C llama -j 8
+make -j 8
 go build .
 ```
--- a/llama/Dockerfile
+++ b/llama/Dockerfile
@ -1,221 +0,0 @@
 # Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
 ARG GOLANG_VERSION=1.22.8
 ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
 ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds
 #
 # docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
 # docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
 #
 ### Then incremental builds will be much faster in this container
 #
 # make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
    dnf clean all && \
    dnf install -y \
    zsh \
    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 # TODO intel oneapi goes here...
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
 ### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
 # Note: this does not contain jetson variants
 #
 # docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
 # docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
 #
 FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
    dnf config-manager --set-enabled appstream && \
    dnf clean all && \
    dnf install -y \
    zsh \
    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
 FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
 ARG OLLAMA_SKIP_CUDA_11_GENERATE
 ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
 ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
        make -C llama -j $(expr $(nproc) / 2 ) ; \
    else \
        make -C llama -j 5 ; \
    fi
 FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
 ARG OLLAMA_SKIP_CUDA_11_GENERATE
 ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG CUDA_V11_ARCHITECTURES
 ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
    make -C llama -j 8
 # Intermediate stages used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 centos:7 AS builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 ENV CGO_ENABLED 1
 ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama
 FROM --platform=linux/amd64 builder-amd64 AS build-amd64
 COPY . .
 COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 ARG OLLAMA_SKIP_ROCM_GENERATE
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
    cd dist/linux-$GOARCH-rocm && \
    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
    fi
 FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 ENV CGO_ENABLED 1
 ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama
 FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
 FROM --platform=linux/arm64 scratch AS dist-arm64
 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
 FROM dist-$TARGETARCH AS dist
 # Optimized container images do not cary nested payloads
 FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-amd64/bin/ollama .
 FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
    go build -trimpath -o dist/linux-arm64/bin/ollama .
 # For amd64 container images, filter out cuda/rocm to minimize size
 FROM runners-amd64 AS runners-cuda-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
    ./dist/linux-amd64/lib/ollama/runners/rocm*
 FROM runners-amd64 AS runners-rocm-amd64
 RUN rm -rf \
    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
    ./dist/linux-amd64/lib/ollama/libcu*.so* \
    ./dist/linux-amd64/lib/ollama/runners/cuda*
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
 COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 # Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
 # across releases
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
    apt-get install -y ca-certificates && \
    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
 FROM runtime-$TARGETARCH
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
--- a/llama/README.md
+++ b/llama/README.md
@ -95,31 +95,17 @@ make -j
 Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model.  While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit.  A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
 > [!IMPORTANT]
 > Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit.  After merging that PR a new manifest file we be utilized
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 ### Updating Base Commit
 **Pin to new base commit**
-To update to a newer base commit, select the upstream git tag or commit
+To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`
 > [!IMPORTANT]
 > After merging #7157 a manifest will be used instead of the submodule
 ```
 cd llm/llama.cpp
 git fetch
 git checkout NEW_BASE_COMMIT
 cd ..
 git add llama.cpp
 ```
 #### Applying patches
@ -128,13 +114,13 @@ When updating to a newer base commit, the existing patches may not apply cleanly
 Start by applying the patches.  If any of the patches have conflicts, the `git am` will stop at the first failure.
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed.  Save the file(s) and continue the patch series with `git am --continue` .  If any additional patches fail, follow the same pattern until the full patch series is applied.  Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
 ```
-make -C llama create-patches sync
+make create-patches sync
 ```
 Build and test Ollama, and make any necessary changes to the Go code based on the new base commit.  Submit your PR to the Ollama repo.
@ -144,14 +130,14 @@ Build and test Ollama, and make any necessary changes to the Go code based on th
 When working on new fixes or features that impact vendored code, use the following model.  First get a clean tracking repo with all current patches applied:
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 Now edit the upstream native code in the `./vendor/` directory.  You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing.  Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
 ```
-make -C llama sync
+make sync
-make -C llama -j 8
+make -j 8
 go build .
 ```
@ -161,7 +147,7 @@ go build .
 Iterate until you're ready to submit PRs.  Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
 ```
-make -C llama create-patches
+make create-patches
 ```
 > [!IMPORTANT]
--- a/llama/llama.go
+++ b/llama/llama.go
@ -1,5 +1,7 @@
 package llama
 //go:generate make -j 8
 /*
 #cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
 #cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
@ -1,11 +1,12 @@
 # Helpers for managing our vendored llama.cpp repo and patch set
-# TODO - this should include a manifest file at the top of the tree 
+REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-LLAMACPP_BASE_COMMIT=$(shell cd ../llm/llama.cpp && git rev-parse HEAD)
+DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-LLAMACPP_REPO := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))vendor/
+include $(REPO_ROOT)llama/vendoring
 LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
 DST_DIR=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
 LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
--- a/llama/vendoring
+++ b/llama/vendoring
@ -0,0 +1 @@
 LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -1,15 +0,0 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
    target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/llm/ext_server/httplib.h
+++ b/llm/ext_server/httplib.h
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@ -1,661 +0,0 @@
 // MIT License
 // Copyright (c) 2023 Georgi Gerganov
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all
 // copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 #pragma once
 #include <string>
 #include <vector>
 #include <set>
 #include <mutex>
 #include <condition_variable>
 #include <unordered_map>
 #include <random>
 #include <iostream>
 #include <thread>
 #include "json.hpp"
 #include "../llava/clip.h"
 using json = nlohmann::json;
 extern bool server_verbose;
 extern bool server_log_json;
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
 #endif
 #if SERVER_VERBOSE != 1
 #define LOG_VERBOSE(MSG, ...)
 #else
 #define LOG_VERBOSE(MSG, ...)                                            \
    do                                                                   \
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
 #endif
 #define LOG_ERROR(  MSG, ...) server_log("ERROR",  __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,          // Server is ready and model is loaded
    SERVER_STATE_ERROR           // An error occurred, load_model failed
 };
 enum task_type {
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
    TASK_TYPE_NEXT_RESPONSE,
    TASK_TYPE_METRICS
 };
 struct task_server {
    int id = -1; // to be filled by llama_server_queue
    int target_id;
    task_type type;
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
    int multitask_id = -1;
 };
 struct task_result {
    int id;
    int multitask_id = -1;
    bool stop;
    bool error;
    json result_json;
 };
 struct task_multi {
    int id;
    std::set<int> subtasks_remaining{};
    std::vector<task_result> results{};
 };
 // completion token output with probabilities
 struct completion_token_output {
    struct token_prob
    {
        llama_token tok;
        float prob;
    };
    std::vector<token_prob> probs;
    llama_token tok;
    std::string text_to_send;
 };
 struct token_translator {
    llama_context * ctx;
    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();
    json log = nlohmann::ordered_json{
        {"tid", ss_tid.str()},
        {"timestamp", time(nullptr)},
    };
    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
        return;
    }
    if (server_log_json) {
        log.merge_patch(
                {
                        {"level",     level},
                        {"function",  function},
                        {"line",      line},
                        {"msg",       message},
                });
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
    } else {
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
        std::stringstream ss;
        ss << level << " [" << function << "] " << message << " |";
        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
            ss << " " << el.key() << "=" << value;
        }
        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
        fflush(stdout);
    }
 }
 //
 // server utils
 //
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
    // Fallback null to default value
    return body.contains(key) && !body.at(key).is_null()
        ? body.value(key, default_value)
        : default_value;
 }
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 inline bool verify_custom_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
    std::vector<char> buf(1);
    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
    return res >= 0;
 }
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());
    for (size_t i = 0; i < messages.size(); ++i) {
        auto &curr_msg = messages[i];
        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
        alloc_size     += str[i*2 + 1].length();
        chat[i].role    = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
    }
    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
    std::vector<char> buf(alloc_size * 2);
    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }
    std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }
 //
 // work queue utils
 //
 struct llama_server_queue {
    int id = 0;
    std::mutex mutex_tasks;
    bool running;
    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
    std::vector<task_multi> queue_multitasks;
    std::condition_variable condition_tasks;
    // callback functions
    std::function<void(task_server&)> callback_new_task;
    std::function<void(task_multi&)> callback_finish_multitask;
    std::function<void(void)> callback_run_slots;
    // Add a new task to the end of the queue
    int post(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
            LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
        return task.id;
    }
    // Add a new task, but defer until one slot is available
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
    }
    // Get the next id for creating anew task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        int new_id = id++;
        LOG_VERBOSE("new task id", {{"new_id", new_id}});
        return new_id;
    }
    // Register function to process a new task
    void on_new_task(std::function<void(task_server&)> callback) {
        callback_new_task = callback;
    }
    // Register function to process a multitask when it is finished
    void on_finish_multitask(std::function<void(task_multi&)> callback) {
        callback_finish_multitask = callback;
    }
    // Register the function to be called when all slots data is ready to be processed
    void on_run_slots(std::function<void(void)> callback) {
        callback_run_slots = callback;
    }
    // Call when the state of one slot is changed
    void notify_slot_changed() {
        // move deferred tasks back to main loop
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : queue_tasks_deferred) {
            queue_tasks.push_back(std::move(task));
        }
        queue_tasks_deferred.clear();
    }
    // end the start_loop routine
    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();
    }
    /**
     * Main loop consists of these steps:
     * - Wait until a new task arrives
     * - Process the task (i.e. maybe copy data into slot)
     * - Check if multitask is finished
     * - Run all slots
     */
    void start_loop() {
        running = true;
        while (true) {
            LOG_VERBOSE("new task may arrive", {});
            {
                while (true)
                {
                    std::unique_lock<std::mutex> lock(mutex_tasks);
                    if (queue_tasks.empty()) {
                        lock.unlock();
                        break;
                    }
                    task_server task = queue_tasks.front();
                    queue_tasks.erase(queue_tasks.begin());
                    lock.unlock();
                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                    callback_new_task(task);
                }
                LOG_VERBOSE("update_multitasks", {});
                // check if we have any finished multitasks
                auto queue_iterator = queue_multitasks.begin();
                while (queue_iterator != queue_multitasks.end())
                {
                    if (queue_iterator->subtasks_remaining.empty())
                    {
                        // all subtasks done == multitask is done
                        task_multi current_multitask = *queue_iterator;
                        callback_finish_multitask(current_multitask);
                        // remove this multitask
                        queue_iterator = queue_multitasks.erase(queue_iterator);
                    }
                    else
                    {
                        ++queue_iterator;
                    }
                }
                // all tasks in the current loop is processed, slots data is now ready
                LOG_VERBOSE("callback_run_slots", {});
                callback_run_slots();
            }
            LOG_VERBOSE("wait for new task", {});
            // wait for new task
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
                    if (!running) {
                        LOG_VERBOSE("ending start_loop", {});
                        return;
                    }
                    condition_tasks.wait(lock, [&]{
                        return (!queue_tasks.empty() || !running);
                    });
                }
            }
        }
    }
    //
    // functions to manage multitasks
    //
    // add a multitask by specifying the id of all subtask (subtask is a task_server)
    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_multi multi;
        multi.id = multitask_id;
        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
        queue_multitasks.push_back(multi);
    }
    // updatethe remaining subtasks, while appending results to multitask
    void update_multitask(int multitask_id, int subtask_id, task_result& result)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        for (auto& multitask : queue_multitasks)
        {
            if (multitask.id == multitask_id)
            {
                multitask.subtasks_remaining.erase(subtask_id);
                multitask.results.push_back(result);
            }
        }
    }
 };
 struct llama_server_response {
    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
    callback_multitask_t callback_update_multitask;
    // for keeping track of all tasks waiting for the result
    std::set<int> waiting_task_ids;
    // the main result queue
    std::vector<task_result> queue_results;
    std::mutex mutex_results;
    std::condition_variable condition_results;
    // add the task_id to the list of tasks waiting for response
    void add_waiting_task_id(int task_id) {
        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
    }
    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int task_id) {
        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
    }
    // This function blocks the thread until there is a response for this task_id
    task_result recv(int task_id) {
        while (true)
        {
            std::unique_lock<std::mutex> lock(mutex_results);
            condition_results.wait(lock, [&]{
                return !queue_results.empty();
            });
            for (int i = 0; i < (int) queue_results.size(); i++)
            {
                if (queue_results[i].id == task_id)
                {
                    assert(queue_results[i].multitask_id == -1);
                    task_result res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }
        }
        // should never reach here
    }
    // Register the function to update multitask
    void on_multitask_update(callback_multitask_t callback) {
        callback_update_multitask = callback;
    }
    // Send a new result to a waiting task_id
    void send(task_result result) {
        std::unique_lock<std::mutex> lock(mutex_results);
        LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }
            if (result.id == task_id)
            {
                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
                condition_results.notify_all();
                return;
            }
        }
    }
 };
 //
 // base64 utils (TODO: move to common in the future)
 //
 static const std::string base64_chars =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "abcdefghijklmnopqrstuvwxyz"
             "0123456789+/";
 static inline bool is_base64(uint8_t c)
 {
    return (isalnum(c) || (c == '+') || (c == '/'));
 }
 static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 {
    int i = 0;
    int j = 0;
    int in_ = 0;
    int in_len = encoded_string.size();
    uint8_t char_array_4[4];
    uint8_t char_array_3[3];
    std::vector<uint8_t> ret;
    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
    {
        char_array_4[i++] = encoded_string[in_]; in_++;
        if (i == 4)
        {
            for (i = 0; i <4; i++)
            {
                char_array_4[i] = base64_chars.find(char_array_4[i]);
            }
            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
            for (i = 0; (i < 3); i++)
            {
                ret.push_back(char_array_3[i]);
            }
            i = 0;
        }
    }
    if (i)
    {
        for (j = i; j <4; j++)
        {
            char_array_4[j] = 0;
        }
        for (j = 0; j <4; j++)
        {
            char_array_4[j] = base64_chars.find(char_array_4[j]);
        }
        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
        for (j = 0; (j < i - 1); j++)
        {
            ret.push_back(char_array_3[j]);
        }
    }
    return ret;
 }
 //
 // random string / id
 //
 static std::string random_string()
 {
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    std::random_device rd;
    std::mt19937 generator(rd());
    std::string result(32, ' ');
    for (int i = 0; i < 32; ++i) {
        result[i] = str[generator() % str.size()];
    }
    return result;
 }
 static std::string gen_chatcmplid()
 {
    std::stringstream chatcmplid;
    chatcmplid << "chatcmpl-" << random_string();
    return chatcmplid.str();
 }
 //
 // other common utils
 //
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
    {
    }
    return i;
 }
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
    return str.size() >= suffix.size() &&
           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
 static size_t find_partial_stop_string(const std::string &stop,
                                       const std::string &text)
 {
    if (!text.empty() && !stop.empty())
    {
        const char text_last_char = text.back();
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
        {
            if (stop[char_index] == text_last_char)
            {
                const std::string current_partial = stop.substr(0, char_index + 1);
                if (ends_with(text, current_partial))
                {
                    return text.size() - char_index - 1;
                }
            }
        }
    }
    return std::string::npos;
 }
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 {
    std::string ret;
    for (; begin != end; ++begin)
    {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
 }
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    //   (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        std::string res(ss.str());
        out = "byte: \\x" + res;
    }
    return out;
 }
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
 {
    json out = json::array();
    for (const auto &prob : probs)
    {
        json probs_for_token = json::array();
        for (const auto &p : prob.probs)
        {
            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
            probs_for_token.push_back(json
            {
                {"tok_str", tok_str},
                {"prob",    p.prob},
            });
        }
        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
        out.push_back(json{
            {"content", tok_str},
            {"probs",   probs_for_token},
        });
    }
    return out;
 }
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -1,137 +0,0 @@
 # common logic across linux and darwin
 init_vars() {
    case "${GOARCH}" in
    "amd64")
        ARCH="x86_64"
        ;;
    "arm64")
        ARCH="arm64"
        ;;
    *)
        echo "GOARCH must be set"
        echo "this script is meant to be run from within go generate"
        exit 1
        ;;
    esac
    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
    case $(uname -s) in
    "Darwin")
        LIB_EXT="dylib"
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        DIST_BASE=../../dist/darwin-${GOARCH}/
        PAYLOAD_BASE=../../build/darwin/${GOARCH}
        ;;
    "Linux")
        LIB_EXT="so"
        WHOLE_ARCHIVE="-Wl,--whole-archive"
        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        DIST_BASE=../../dist/linux-${GOARCH}/
        PAYLOAD_BASE=../../build/linux/${GOARCH}
        ;;
    *)
        ;;
    esac
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }
 git_module_setup() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
    # Make sure the tree is clean after the directory moves
    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
        echo "Cleaning up old submodule"
        rm -rf ${LLAMACPP_DIR}
    fi
    git submodule init
    git submodule update --force ${LLAMACPP_DIR}
 }
 apply_patches() {
    # apply temporary patches until fix is upstream
    for patch in ../patches/*.patch; do
        git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
    done
 }
 build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    # remove unnecessary build artifacts
    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }
 dist() {
    [ -z "${RUNNER}" ] && exit 1
    mkdir -p ${RUNNER_BASE}/${RUNNER}/
    for f in ${BUILD_DIR}/bin/* ; do
        cp ${f} ${RUNNER_BASE}/${RUNNER}/
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
            cp ${f} ${RUNNER_BASE}/${RUNNER}/
        done
    fi
 }
 # Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
 compress() {
    [ -z "${RUNNER}" ] && exit 1
    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
    for f in ${BUILD_DIR}/bin/* ; do
        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
            compress_pids+=" $!"
        done
    fi
    echo
 }
 wait_for_compress() {
    for pid in ${compress_pids}; do
        wait $pid
    done
    echo "Finished compression"
 }
 install() {
    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
        cp -af "${lib}" "${BUILD_DIR}/bin/"
    done
 }
 # Keep the local tree clean after we're done with the build
 cleanup() {
    git submodule update --force ${LLAMACPP_DIR}
 }
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -1,91 +0,0 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ./llm/generate/
 # TODO - add hardening to detect missing tools (cmake, etc.)
 set -ex
 set -o pipefail
 compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
 sign() {
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
    fi
 }
 COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
 case "${GOARCH}" in
 "amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
    if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
        #
        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
        RUNNER=cpu
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building LCD CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
        #
        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
        # Approximately 400% faster than LCD on same CPU
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
        RUNNER=cpu_avx
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building AVX CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
        #
        # ~2013 CPU Dynamic library
        # Approximately 10% faster than AVX on same CPU
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
        RUNNER=cpu_avx2
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building AVX2 CPU"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
    fi
    ;;
 "arm64")
    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
        init_vars
        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
        RUNNER="metal"
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
    fi
    ;;
 *)
    echo "GOARCH must be set"
    echo "this script is meant to be run from within go generate"
    exit 1
    ;;
 esac
 cleanup
 wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -1,285 +0,0 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be llm/generate/
 # First we build one or more CPU based LLM libraries
 #
 # Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
 # library dependencies
 #
 # Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
 # libraries are quite large, and also dynamically load data files at runtime
 # which in turn are large, so we don't attempt to cary them as payload
 set -ex
 set -o pipefail
 compress_pids=""
 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    GPU_LIST=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    (
        IFS=$';'
        echo "'${GPU_LIST[*]}'"
    )
 }
 echo "Starting linux generate script"
 if [ -z "${CUDACXX}" ]; then
    if [ -x /usr/local/cuda/bin/nvcc ]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # Try the default location in case it exists
        export CUDACXX=$(command -v nvcc)
    fi
 fi
 COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        RUNNER="cpu"
        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
        echo "Building custom CPU"
        build
        install
        dist
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
            RUNNER=cpu
            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
            echo "Building LCD CPU"
            build
            install
            dist
            compress
        fi
        if [ "${ARCH}" == "x86_64" ]; then
            #
            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
            #
            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
                #
                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
                # Approximately 400% faster than LCD on same CPU
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
                RUNNER=cpu_avx
                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
                echo "Building AVX CPU"
                build
                install
                dist
                compress
            fi
            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
                #
                # ~2013 CPU Dynamic library
                # Approximately 10% faster than AVX on same CPU
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
                RUNNER=cpu_avx2
                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
                echo "Building AVX2 CPU"
                build
                install
                dist
                compress
            fi
        fi
    fi
 else
    echo "Skipping CPU generation step as requested"
 fi
 # If needed, look for the default CUDA toolkit location
 if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
    CUDA_LIB_DIR=/usr/local/cuda/lib64
 fi
 # If needed, look for CUDA on Arch Linux
 if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
 fi
 # Allow override in case libcudart is in the wrong place
 if [ -z "${CUDART_LIB_DIR}" ]; then
    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
 fi
 if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [ "${ARCH}" == "arm64" ]; then
        echo "ARM CPU detected - disabling unsupported AVX instructions"
        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
    fi
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
    export CUDAFLAGS="-t8"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
    RUNNER=cuda${CUDA_VARIANT}
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
    install
    dist
    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
    mkdir -p "${CUDA_DIST_DIR}"
    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
        cp -a "${lib}" "${CUDA_DIST_DIR}"
    done
    compress
 fi
 if [ -z "${ONEAPI_ROOT}" ]; then
    # Try the default location in case it exists
    ONEAPI_ROOT=/opt/intel/oneapi
 fi
 if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    echo "OneAPI libraries detected - building dynamic OneAPI library"
    init_vars
    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    RUNNER=oneapi
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build
    # copy oneAPI dependencies
    mkdir -p "${ONEAPI_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
    done
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
    install
    dist
    compress
 fi
 if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
 fi
 if [ -z "${CLBlast_DIR}" ]; then
    # Try the default location in case it exists
    if [ -d /usr/lib/cmake/CLBlast ]; then
        export CLBlast_DIR=/usr/lib/cmake/CLBlast
    fi
 fi
 if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
        echo "Building custom ROCM GPU"
    fi
    RUNNER=rocm${ROCM_VARIANT}
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    # ROCm dependencies are too large to fit into a unified bundle
    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
    # TODO figure out how to disable runpath (rpath)
    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
    # copy the ROCM dependencies
    mkdir -p "${ROCM_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
        cp -a "${dep}"* "${ROCM_DIST_DIR}"
        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
        fi
    done
    install
    dist
    compress
 fi
 cleanup
 wait_for_compress
 echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -1,403 +0,0 @@
 #!powershell
 $ErrorActionPreference = "Stop"
 function amdGPUs {
    if ($env:AMDGPU_TARGETS) {
        return $env:AMDGPU_TARGETS
    }
    # Current supported rocblas list from ROCm v6.1.2 on windows
    # https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
    $GPU_LIST = @(
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    $GPU_LIST -join ';'
 }
 function init_vars {
    write-host "Checking for cmake..."
    get-command cmake
    write-host "Checking for ninja..."
    $d=(get-command -ea 'silentlycontinue' ninja).path
    if ($null -eq $d) {
        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
        $matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
        if ($matches.count -eq 0) {
            throw "Unable to locate ninja"
        }
        $ninjaDir=($matches[0].FullName | split-path -parent)
        $env:PATH="$env:PATH;$ninjaDir"
    }
    if (!$script:SRC_DIR) {
        $script:SRC_DIR = $(resolve-path "..\..\")
    }
    if (!$script:llamacppDir) {
        $script:llamacppDir = "../llama.cpp"
    }
    if (!$script:cmakeTargets) {
        $script:cmakeTargets = @("ollama_llama_server")
    }
    $script:cmakeDefs = @(
        "-DBUILD_SHARED_LIBS=on",
        "-DGGML_NATIVE=off",
        "-DGGML_OPENMP=off"
        )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
        $script:config = "RelWithDebInfo"
    } else {
        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
        $script:config = "Release"
    }
    if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
        $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
    }
    # Try to find the CUDA dir
    if ($env:CUDA_LIB_DIR -eq $null) {
        $d=(get-command -ea 'silentlycontinue' nvcc).path
        if ($d -ne $null) {
            $script:CUDA_LIB_DIR=($d| split-path -parent)
            $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
        }
    } else {
        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
    }
    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    } else {
        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
    }
    # Note: Windows Kits 10 signtool crashes with GCP's plugin
    if ($null -eq $env:SIGN_TOOL) {
        ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
    } else {
        ${script:SignTool}=${env:SIGN_TOOL}
    }
    if ("${env:KEY_CONTAINER}") {
        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
    }
 }
 function git_module_setup {
    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
    & git submodule init
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    & git submodule update --force "${script:llamacppDir}"
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 function apply_patches {
    # Apply temporary patches until fix is upstream
    foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
        git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
    }
 }
 function build {
    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
    & cmake --version
    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    if ($cmakeDefs -contains "-G") {
        $extra=@("-j8")
    } else {
        $extra= @("--", "/maxCpuCount:8")
    }
    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    # Rearrange output to be consistent between different generators
    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
        remove-item "${script:buildDir}/bin/${script:config}"
    }
 }
 function sign {
    if ("${env:KEY_CONTAINER}") {
        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
    }
 }
 function install {
    write-host "Installing binaries to dist dir ${script:distDir}"
    mkdir ${script:distDir} -ErrorAction SilentlyContinue
    $binaries = dir "${script:buildDir}/bin/*.exe"
    foreach ($file in $binaries) {
        copy-item -Path $file -Destination ${script:distDir} -Force
    }
    write-host "Installing dlls to dist dir ${script:distDir}"
    $dlls = dir "${script:buildDir}/bin/*.dll"
    foreach ($file in $dlls) {
        copy-item -Path $file -Destination ${script:distDir} -Force
    }
 }
 function cleanup {
    $patches = Get-ChildItem "../patches/*.diff"
    foreach ($patch in $patches) {
        # Extract file paths from the patch file
        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
            $parts = $_ -split ' '
            ($parts[1] -split '/', 2)[1]
        }
        # Checkout each file
        foreach ($file in $filePaths) {
            git -C "${script:llamacppDir}" checkout $file
        }
        git -C "${script:llamacppDir}" checkout CMakeLists.txt
    }
 }
 # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
 # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 function build_cpu_x64 {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
        init_vars
        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu"
        $script:distDir="$script:DIST_BASE\cpu"
        write-host "Building LCD CPU"
        build
        sign
        install
    } else {
        write-host "Skipping CPU generation step as requested"
    }
 }
 function build_cpu_arm64 {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
        init_vars
        write-host "Checking for clang..."
        get-command clang
        $env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
        $env:CXXFLAGS="$env:CFLAGS"
        $env:LDFLAGS="-static-libstdc++"
        $script:cmakeDefs = $script:commonCpuDefs + @(
            "-DCMAKE_VERBOSE_MAKEFILE=on",
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
        ) + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu"
        $script:distDir="$script:DIST_BASE\cpu"
        write-host "Building LCD CPU"
        build
        sign
        install
    } else {
        write-host "Skipping CPU generation step as requested"
    }
 }
 function build_cpu_avx() {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
        init_vars
        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
        $script:distDir="$script:DIST_BASE\cpu_avx"
        write-host "Building AVX CPU"
        build
        sign
        install
    } else {
        write-host "Skipping CPU AVX generation step as requested"
    }
 }
 function build_cpu_avx2() {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
        init_vars
        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
        $script:distDir="$script:DIST_BASE\cpu_avx2"
        write-host "Building AVX2 CPU"
        build
        sign
        install
    } else {
        write-host "Skipping CPU AVX2 generation step as requested"
    }
 }
 function build_cuda() {
    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
        # Then build cuda as a dynamically loaded library
        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
        if ($null -ne $script:CUDA_VERSION) {
            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
        }
        init_vars
        $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
        $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
        $script:cmakeDefs += @(
            "-A", "x64",
            "-DGGML_CUDA=ON",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
            "-DCMAKE_CUDA_FLAGS=-t6",
            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
            )
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
            $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
            write-host "building custom CUDA GPU"
        }
        build
        sign
        install
        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    } else {
        write-host "Skipping CUDA generation step"
    }
 }
 function build_oneapi() {
  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}"))  {
    # Get oneAPI version
    $script:ONEAPI_VERSION = icpx --version
    $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
    if ($null -ne $script:ONEAPI_VERSION) {
      $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
    }
    init_vars
    $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
    $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
    $script:cmakeDefs += @(
      "-G", "MinGW Makefiles",
      "-DGGML_SYCL=ON",
      "-DCMAKE_C_COMPILER=icx",
      "-DCMAKE_CXX_COMPILER=icx",
      "-DCMAKE_BUILD_TYPE=Release"
    )
    Write-Host "Building oneAPI"
    build
    # Ninja doesn't prefix with config name
    if ($null -ne $script:DUMPBIN) {
      & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
    }
    sign
    install
    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
  } else {
    Write-Host "Skipping oneAPI generation step"
  }
 }
 function build_rocm() {
    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
        if ($null -ne $script:ROCM_VERSION) {
            $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
        }
        init_vars
        $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
        $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
        $script:cmakeDefs += @(
            "-G", "Ninja",
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
            "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
            "-DAMDGPU_TARGETS=$(amdGPUs)",
            "-DGPU_TARGETS=$(amdGPUs)"
            )
        # Make sure the ROCm binary dir is first in the path
        $env:PATH="$env:HIP_PATH\bin;$env:PATH"
        # We have to clobber the LIB var from the developer shell for clang to work properly
        $env:LIB=""
        if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
            write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
            $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
            write-host "building custom ROCM GPU"
        }
        write-host "Building ROCm"
        build
        # Ninja doesn't prefix with config name
        ${script:config}=""
        if ($null -ne $script:DUMPBIN) {
            & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
        }
        sign
        install
        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
    } else {
        write-host "Skipping ROCm generation step"
    }
 }
 init_vars
 if ($($args.count) -eq 0) {
    git_module_setup
    apply_patches
    if ($script:ARCH -eq "arm64") {
        build_cpu_arm64
    } else { # amd64
        build_cpu_x64
        build_cpu_avx
        build_cpu_avx2
        build_cuda
        build_oneapi
        build_rocm
    }
    cleanup
    write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
 } else {
    for ( $i = 0; $i -lt $args.count; $i++ ) {
        write-host "performing $($args[$i])"
        & $($args[$i])
    }
 }
--- a/llm/generate/generate_darwin.go
+++ b/llm/generate/generate_darwin.go
@ -1,3 +0,0 @@
 package generate
 //go:generate bash ./gen_darwin.sh
--- a/llm/generate/generate_linux.go
+++ b/llm/generate/generate_linux.go
@ -1,3 +0,0 @@
 package generate
 //go:generate bash ./gen_linux.sh
--- a/llm/generate/generate_windows.go
+++ b/llm/generate/generate_windows.go
@ -1,3 +0,0 @@
 package generate
 //go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@ -1 +0,0 @@
 Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555
--- a/llm/patches/0000-cmakelist.patch
+++ b/llm/patches/0000-cmakelist.patch
@ -1,22 +0,0 @@
 From 7a3555098d4591c9b329c677654497ed8cee07ec Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Fri, 23 Aug 2024 11:27:48 -0700
 Subject: [PATCH] patch cmakelist
 ---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index 415743c2..aaadd13e 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
@@ -210,3 +210,5 @@ if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
 endif()
 +
 +add_subdirectory(../ext_server ext_server) # ollama
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0001-load-progress.patch
+++ b/llm/patches/0001-load-progress.patch
@ -1,44 +0,0 @@
 From c97ed60c3369294d5551ba099a88ddc509687df1 Mon Sep 17 00:00:00 2001
 From: Gabe Goodhart <ghart@us.ibm.com>
 Date: Thu, 19 Sep 2024 16:55:15 -0600
 Subject: [PATCH] patch load progress
 ---
 common/common.cpp | 2 ++
 common/common.h   | 7 +++++++
 2 files changed, 9 insertions(+)
 diff --git a/common/common.cpp b/common/common.cpp
 index 8d0ed4f9..a09e8a53 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
@@ -955,6 +955,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
 +    mparams.progress_callback = params.progress_callback;
 +    mparams.progress_callback_user_data = params.progress_callback_user_data;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
 diff --git a/common/common.h b/common/common.h
 index cb87c447..818a4a4a 100644
 --- a/common/common.h
 +++ b/common/common.h
@@ -266,6 +266,13 @@ struct gpt_params {
     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
     std::vector<std::string> image; // path to image file(s)
 +    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 +    // If the provided progress_callback returns true, model loading continues.
 +    // If it returns false, model loading is immediately aborted.
 +    llama_progress_callback progress_callback = NULL;
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
 +
     // embedding
     bool embedding         = false; // get only sentence embedding
     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0002-clip-log.patch
+++ b/llm/patches/0002-clip-log.patch
@ -1,24 +0,0 @@
 From 6fdf4268e13e56f0050fa6a29b029cbd54be49d2 Mon Sep 17 00:00:00 2001
 From: Gabe Goodhart <ghart@us.ibm.com>
 Date: Thu, 19 Sep 2024 16:58:03 -0600
 Subject: [PATCH] clip log
 ---
 examples/llava/clip.cpp | 1 +
 1 file changed, 1 insertion(+)
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index 8aa7b075..b8941c74 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 +#include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0003-load_exception.patch
+++ b/llm/patches/0003-load_exception.patch
@ -1,57 +0,0 @@
 From 4f2b9cd0f012c49f40d0784454864ad41ca418b2 Mon Sep 17 00:00:00 2001
 From: Gabe Goodhart <ghart@us.ibm.com>
 Date: Thu, 19 Sep 2024 17:00:28 -0600
 Subject: [PATCH] load exception
 ---
 src/llama.cpp | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)
 diff --git a/src/llama.cpp b/src/llama.cpp
 index af8afd84..4d1db3d5 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -8871,7 +8871,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
 -        return -1;
 +        throw;
     }
     // loading time will be recalculate after the first eval, so
@@ -18675,16 +18675,23 @@ struct llama_model * llama_load_model_from_file(
         }
         model->rpc_servers.push_back(servers);
     }
 -    int status = llama_model_load(path_model, *model, params);
 -    GGML_ASSERT(status <= 0);
 -    if (status < 0) {
 -        if (status == -1) {
 -            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 -        } else if (status == -2) {
 -            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
 +
 +    try {
 +        int status = llama_model_load(path_model, *model, params);
 +        GGML_ASSERT(status <= 0);
 +        if (status < 0) {
 +            if (status == -1) {
 +                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
 +            } else if (status == -2) {
 +                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
 +            }
 +            delete model;
 +            return nullptr;
         }
 +    } catch (...) {
 +        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
         delete model;
 -        return nullptr;
 +        throw;
     }
     return model;
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0004-metal.patch
+++ b/llm/patches/0004-metal.patch
@ -1,57 +0,0 @@
 From 91d3f886f1645b38d9658c0e125603e8d5338146 Mon Sep 17 00:00:00 2001
 From: nobody <>
 Date: Tue, 1 Oct 2024 13:55:01 -0600
 Subject: [PATCH] metal
 ---
 ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)
 diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
 index 9da08fe2..3a433703 100644
 --- a/ggml/src/ggml-metal.m
 +++ b/ggml/src/ggml-metal.m
@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
                 // to the matrix-vector kernel
                 int ne11_mm_min = 1;
 -#if 0
                 // the numbers below are measured on M2 Ultra for 7B and 13B models
                 // these numbers do not translate to other devices or model sizes
                 // TODO: need to find a better approach
 -                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
 -                            switch (src0t) {
 -                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
 -                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
 -                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
 -                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
 -                                case GGML_TYPE_Q4_0:
 -                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
 -                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
 -                                case GGML_TYPE_Q5_0:                          // not tested yet
 -                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
 -                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
 -                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
 -                                default:             ne11_mm_min = 1;  break;
 -                            }
 +                        switch (src0t) {
 +                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
 +                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
 +                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
 +                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
 +                            case GGML_TYPE_Q4_0:
 +                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
 +                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
 +                            case GGML_TYPE_Q5_0:                          // not tested yet
 +                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
 +                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
 +                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
 +                            default:             ne11_mm_min = 1;  break;
                         }
 -#endif
                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0005-default-pretokenizer.patch
+++ b/llm/patches/0005-default-pretokenizer.patch
@ -1,44 +0,0 @@
 From 0e531d69786c4a96a3a2bcf7b2d576bd6f7edf25 Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:13 -0700
 Subject: [PATCH] 05-default-pretokenizer.diff
 ---
 src/llama.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)
 diff --git a/src/llama.cpp b/src/llama.cpp
 index 4c0a1bb6..800dfb95 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
 -            if (tokenizer_pre.empty()) {
 -                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
 -                LLAMA_LOG_WARN("%s:                                             \n", __func__);
 -                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
 -                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
 -                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
 -                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
 -                LLAMA_LOG_WARN("%s:                                             \n", __func__);
 -                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 -            } else if (tokenizer_pre == "default") {
 +            if (tokenizer_pre == "default") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
                 vocab.tokenizer_add_bos = true;
                 vocab.tokenizer_clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
 +                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             }
         } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0006-embeddings.patch
+++ b/llm/patches/0006-embeddings.patch
@ -1,54 +0,0 @@
 From 235b6d876a74cb09abe26985fa89ebe5bfc9f562 Mon Sep 17 00:00:00 2001
 From: Gabe Goodhart <ghart@us.ibm.com>
 Date: Thu, 19 Sep 2024 17:06:17 -0600
 Subject: [PATCH] embeddings
 ---
 src/llama.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)
 diff --git a/src/llama.cpp b/src/llama.cpp
 index 1a8e0c51..e55ec3f8 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -16516,7 +16516,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
     // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -16794,20 +16794,23 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
 -        } else if (cparams.embeddings) {
 -            res  = nullptr; // do not extract logits for embedding case
 -            embd = nullptr;
 +        }
 +
 +        if (cparams.embeddings) {
             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
 +                embd = ggml_graph_node(gf, i);
                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
 -                    embd = ggml_graph_node(gf, i);
                     break;
                 }
             }
 -            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
 +
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0007-clip-unicode.patch
+++ b/llm/patches/0007-clip-unicode.patch
@ -1,54 +0,0 @@
 From 01c42149cbdc194644a2f138598029938e0dd447 Mon Sep 17 00:00:00 2001
 From: Gabe Goodhart <ghart@us.ibm.com>
 Date: Thu, 19 Sep 2024 17:09:57 -0600
 Subject: [PATCH] clip unicode
 ---
 examples/llava/clip.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index b8941c74..3a735f17 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -40,6 +40,14 @@
 #include <cinttypes>
 #include <limits>
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
 +#ifndef NOMINMAX
 +    #define NOMINMAX
 +#endif
 +#include <windows.h>
 +#endif
 +
 #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
 #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
 #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -1227,7 +1235,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             return nullptr;
         }
 +#ifdef _WIN32
 +        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
 +        if (!wlen) {
 +            return NULL;
 +        }
 +        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
 +        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
 +        if (!wlen) {
 +            free(wbuf);
 +            return NULL;
 +        }
 +        auto fin = std::ifstream(wbuf, std::ios::binary);
 +        free(wbuf);
 +#else
         auto fin = std::ifstream(fname, std::ios::binary);
 +#endif
         if (!fin) {
             LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
 -- 
 2.39.3 (Apple Git-146)
--- a/llm/patches/0008-solar-pro.patch
+++ b/llm/patches/0008-solar-pro.patch
@ -1,412 +0,0 @@
 From a8fe40fa7b026d2db9bb6aeecd24fcd2027110ec Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:16 -0700
 Subject: [PATCH] add solar-pro support
 solar-pro introduces block skip connections where blocks are connected
 to other, non-sequential blocks with a scale multiple
 this change adds 4 new keys to store the skip connections and one new
 tensor to store the scalar. the scalar is implemented a 1-dimensional
 tensor with 2 elements dervied from the model's bskcn_tv configuration.
 in general, the values are (bskcn_tv, 1 - bskcn_tv)
 ---
 src/llama.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 255 insertions(+), 15 deletions(-)
 diff --git a/src/llama.cpp b/src/llama.cpp
 index 4c0a1bb6..c6fc0c3f 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
 +    LLM_ARCH_SOLAR,
     LLM_ARCH_UNKNOWN,
 };
@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
 +    { LLM_ARCH_SOLAR,           "solar"        },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
@@ -327,6 +329,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
 +    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
 -    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
 -    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
 -    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
 -    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
 -    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
 -    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
 -    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
 -    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
 -    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
 -    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
 -    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
 -    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 -    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
 -    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
 +    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"               },
 +    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"            },
 +    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"           },
 +    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"                },
 +    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"               },
 +    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"             },
 +    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"       },
 +    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon"   },
 +    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                   },
 +    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"              },
 +    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"             },
 +    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count"   },
 +    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
 +    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                    },
 +    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
@@ -608,6 +612,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
 +    LLM_TENSOR_BSKCN_TV,
 };
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1527,6 +1532,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
 +
 +    {
 +        LLM_ARCH_SOLAR,
 +        {
 +            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
 +            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
 +            { LLM_TENSOR_OUTPUT,          "output" },
 +            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
 +            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
 +            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
 +            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
 +            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
 +            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
 +            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
 +            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
 +            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
 +            { LLM_TENSOR_BSKCN_TV,        "bskcn_tv" },
 +        },
 +    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2360,6 +2384,7 @@ enum e_model {
     MODEL_15B,
     MODEL_16B,
     MODEL_20B,
 +    MODEL_22B,
     MODEL_30B,
     MODEL_34B,
     MODEL_35B,
@@ -2409,6 +2434,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 +    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
 +
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
@@ -2479,6 +2506,7 @@ struct llama_hparams {
         if (this->n_head_arr    != other.n_head_arr)    return true;
         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
         if (this->n_ff_arr      != other.n_ff_arr)      return true;
 +        if (this->n_bskcn_arr   != other.n_bskcn_arr)   return true;
         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2588,6 +2616,14 @@ struct llama_hparams {
             return ssm_d_state * ssm_d_inner;
         }
     }
 +
 +    bool n_bskcn(uint32_t n, uint32_t il = 0) const {
 +        if (il < n_layer) {
 +            return n_bskcn_arr[n][il] > 0;
 +        }
 +
 +        GGML_ABORT("fatal error");
 +    }
 };
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2769,6 +2805,8 @@ struct llama_layer {
     struct ggml_tensor * ffn_gate_scale;
     struct ggml_tensor * ffn_up_scale;
     struct ggml_tensor * ffn_down_scale;
 +
 +    struct ggml_tensor * bskcn_tv;
 };
 // very similar to llama_batch,
@@ -6134,6 +6172,21 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                }
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
 +                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 +
 +                for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
 +                    auto & bskcn = hparams.n_bskcn_arr.at(i);
 +                    bskcn.fill(0);
 +                    ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
 +                }
 +
 +                switch (hparams.n_layer) {
 +                    case 64: model.type = e_model::MODEL_22B; break;
 +                    default: model.type = e_model::MODEL_UNKNOWN;
 +                }
 +            }
         default: (void)0;
     }
@@ -8839,6 +8892,37 @@ static bool llm_load_tensors(
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 +                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
 +                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
 +                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
 +                    }
 +                } break;
 +            case LLM_ARCH_SOLAR:
 +                {
 +                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 +
 +                    // output
 +                    {
 +                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 +                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 +                    }
 +
 +                    for (int i = 0; i < n_layer; ++i) {
 +                        ggml_context * ctx_layer = ctx_for_layer(i);
 +                        ggml_context * ctx_split = ctx_for_layer_split(i);
 +
 +                        auto & layer = model.layers[i];
 +
 +                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 +                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
 +                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
 +                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
 +                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 +
 +                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 +
 +                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 +
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
@@ -16009,7 +16093,6 @@ struct llm_build_context {
         return gf;
     }
 -
     // ref: https://github.com/facebookresearch/chameleon
     // based on the original build_llama() function, changes:
     //   * qk-norm
@@ -16187,6 +16270,158 @@ struct llm_build_context {
         return gf;
     }
 +
 +    ggml_cgraph * build_solar() {
 +        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 +
 +        // mutable variable, needed during the last layer of the computation to skip unused tokens
 +        int32_t n_tokens = this->n_tokens;
 +
 +        const int64_t n_embd_head = hparams.n_embd_head_v;
 +        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 +        GGML_ASSERT(n_embd_head == hparams.n_rot);
 +
 +        struct ggml_tensor * cur;
 +        struct ggml_tensor * inpL;
 +
 +        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 +
 +        // inp_pos - contains the positions
 +        struct ggml_tensor * inp_pos = build_inp_pos();
 +
 +        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 +        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 +
 +        struct ggml_tensor * bskcn_1;
 +        struct ggml_tensor * bskcn_2;
 +
 +        for (int il = 0; il < n_layer; ++il) {
 +            struct ggml_tensor * inpSA = inpL;
 +
 +            if (hparams.n_bskcn(0, il)) {
 +                bskcn_1 = inpSA;
 +            }
 +
 +            if (hparams.n_bskcn(1, il)) {
 +                bskcn_2 = inpSA;
 +            }
 +
 +            if (hparams.n_bskcn(2, il)) {
 +                inpSA = ggml_add(
 +                   ctx0,
 +                   ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
 +                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
 +            }
 +
 +            if (hparams.n_bskcn(3, il)) {
 +                inpSA = ggml_add(
 +                   ctx0,
 +                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
 +                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
 +            }
 +
 +            // norm
 +            cur = llm_build_norm(ctx0, inpL, hparams,
 +                    model.layers[il].attn_norm, NULL,
 +                    LLM_NORM_RMS, cb, il);
 +            cb(cur, "attn_norm", il);
 +
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
 +                struct ggml_tensor * rope_factors = build_rope_factors(il);
 +
 +                // compute Q and K and RoPE them
 +                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
 +                cb(Qcur, "Qcur", il);
 +                if (model.layers[il].bq) {
 +                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 +                    cb(Qcur, "Qcur", il);
 +                }
 +
 +                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
 +                cb(Kcur, "Kcur", il);
 +                if (model.layers[il].bk) {
 +                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 +                    cb(Kcur, "Kcur", il);
 +                }
 +
 +                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
 +                cb(Vcur, "Vcur", il);
 +                if (model.layers[il].bv) {
 +                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 +                    cb(Vcur, "Vcur", il);
 +                }
 +
 +                Qcur = ggml_rope_ext(
 +                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
 +                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 +                    ext_factor, attn_factor, beta_fast, beta_slow
 +                );
 +                cb(Qcur, "Qcur", il);
 +
 +                Kcur = ggml_rope_ext(
 +                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
 +                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 +                    ext_factor, attn_factor, beta_fast, beta_slow
 +                );
 +                cb(Kcur, "Kcur", il);
 +
 +                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
 +                        model.layers[il].wo, model.layers[il].bo,
 +                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 +            }
 +
 +            if (il == n_layer - 1) {
 +                // skip computing output for unused tokens
 +                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
 +                n_tokens = n_outputs;
 +                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
 +                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 +            }
 +
 +            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 +            cb(ffn_inp, "ffn_inp", il);
 +
 +            // feed-forward network
 +            cur = llm_build_norm(ctx0, ffn_inp, hparams,
 +                    model.layers[il].ffn_norm, NULL,
 +                    LLM_NORM_RMS, cb, il);
 +            cb(cur, "ffn_norm", il);
 +
 +            cur = llm_build_ffn(ctx0, lctx, cur,
 +                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
 +                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
 +                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 +                    NULL,
 +                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
 +            cb(cur, "ffn_out", il);
 +
 +            cur = ggml_add(ctx0, cur, ffn_inp);
 +            cb(cur, "ffn_out", il);
 +
 +            cur = lctx.cvec.apply_to(ctx0, cur, il);
 +            cb(cur, "l_out", il);
 +
 +            // input for next layer
 +            inpL = cur;
 +        }
 +
 +        cur = inpL;
 +
 +        cur = llm_build_norm(ctx0, cur, hparams,
 +                model.output_norm, NULL,
 +                LLM_NORM_RMS, cb, -1);
 +        cb(cur, "result_norm", -1);
 +
 +        // lm_head
 +        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
 +        cb(cur, "result_output", -1);
 +
 +        ggml_build_forward_expand(gf, cur);
 +
 +        return gf;
 +    }
 };
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16451,6 +16686,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
 +                result = llm.build_solar();
 +            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -19594,6 +19833,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
 +        case LLM_ARCH_SOLAR:
             return LLAMA_ROPE_TYPE_NORM;
         // the pairs of head values are offset by n_rot/2
 -- 
 2.39.3 (Apple Git-146)
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@ -7,15 +7,9 @@ set -e
 mkdir -p dist
 for TARGETARCH in arm64 amd64; do
-    if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
+    echo "Building Go runner darwin $TARGETARCH"
-        echo "Building Go runner darwin $TARGETARCH"
+    rm -rf llama/build
-        rm -rf llama/build
+    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
        GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
    else
        echo "Building C++ runner darwin $TARGETARCH"
        rm -rf llm/build
        GOOS=darwin GOARCH=$TARGETARCH go generate ./...
    fi
    # These require Xcode v13 or older to target MacOS v11
    # If installed to an alternate location use the following to enable
    # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@ -19,7 +19,7 @@ docker buildx build \
    ${LOAD_OR_PUSH} \
    --platform=${PLATFORM} \
    ${OLLAMA_COMMON_BUILD_ARGS} \
-    -f ${DOCKERFILE_DIR}Dockerfile \
+    -f Dockerfile \
    -t ${FINAL_IMAGE_REPO}:$VERSION \
    .
@ -29,7 +29,7 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
        --platform=linux/amd64 \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target runtime-rocm \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
        .
 fi
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@ -19,7 +19,7 @@ docker buildx build \
        --platform=${PLATFORM} \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target dist \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
        .
 # buildx behavior changes for single vs. multiplatform
--- a/scripts/build_remote.py
+++ b/scripts/build_remote.py
@ -1,76 +0,0 @@
 #!/usr/bin/env python3
 import subprocess
 import sys
 from urllib.parse import urlparse
 from git import Repo
 # Helper script to be able to build on remote repos using git to push local changes
 # (e.g. particularly helpful to target a remote windows build system)
 #
 # Typical windows remote git config looks like this:
 #
 #[remote "windows-pa"]
 #        url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
 #        fetch = +refs/heads/*:refs/remotes/windows-pa/*
 #        uploadpack = powershell git upload-pack
 #        receivepack = powershell git receive-pack
 #
 # TODO - add argpare and make this more configurable 
 # - force flag becomes optional
 # - generate, build or test ...
 # Note: remote repo will need this run once:
 # git config --local receive.denyCurrentBranch updateInstead
 repo = Repo(".")
 # On linux, add links in /usr/local/bin to the go binaries to avoid needing this
 # GoCmd = "/usr/local/go/bin/go" 
 GoCmd = "go" 
 if repo.is_dirty():
    print("Tree is dirty.  Commit your changes before running this script")
    sys.exit(1)
 if len(sys.argv) != 2:
    print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
    sys.exit(1)
 remote_name = sys.argv[1]
 remote = {r.name: r for r in repo.remotes}[remote_name]
 raw_url = list(remote.urls)[0]
 url = urlparse(raw_url)
 # Windows urls don't quite parse properly
 if url.scheme == "" and url.netloc == "":
    url = urlparse("ssh://" + raw_url)
 print("URL: " + str(url))
 netloc = url.netloc.split(":")[0]
 path = url.path
 branch_name = repo.active_branch.name
 print("Force pushing content to remote...")
 # Use with care given the force push
 remote.push(force=True).raise_if_error()
 print("Ensuring correct branch checked out on remote via ssh...")
 subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
 # TODO - add some hardening to try to figure out how to set up the path properly
 # subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
 # TODO - or consider paramiko maybe
 print("Running Windows Build Script")
 subprocess.check_call(['ssh', netloc, 'cd', path, ';', "powershell", "-ExecutionPolicy", "Bypass", "-File", "./scripts/build_windows.ps1"])
 # print("Building")
 # subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
 print("Copying built result")
 subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe",  './dist/'])
 print("Copying installer")
 subprocess.check_call(['scp', netloc +":"+ path + "/dist/Ollama Setup.exe",  './dist/'])
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@ -83,50 +83,7 @@ function buildOllama() {
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
        write-host "Building ollama runners"
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        if ($null -eq ${env:OLLAMA_NEW_RUNNERS}) {
+        & make -C llama -j 12
            # Start by skipping CUDA to build everything else
            write-host "Building ollama runners"
            powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
            # Then skip everyhting else and build all the CUDA variants
            foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
                write-host "Building CUDA ${env:CUDA_LIB_DIR} runner"
                if ($env:CUDA_LIB_DIR.Contains("v12")) {
                    powershell -Command {
                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
                        $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
                        $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
                        & go generate ./...
                    }
                } else {
                    powershell -Command {
                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
                        $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
                        $env:OLLAMA_CUSTOM_CUDA_DEFS=""
                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
                        & go generate ./...
                    }
                }
                if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            }
        } else {
            & make -C llama -j 12
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        }
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@ -172,7 +129,7 @@ function gatherDependencies() {
    } else {
        $depArch=$script:TARGET_ARCH
    }
-    if ($depArch -eq "amd64") {
+    if ($depArch -eq "x64") {
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
--- a/scripts/env.sh
+++ b/scripts/env.sh
@ -20,12 +20,6 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=CUSTOM_CPU_FLAGS \
    --build-arg=GPU_RUNNER_CPU_FLAGS \
    --build-arg=AMDGPU_TARGETS"
 OLLAMA_NEW_RUNNERS=${OLLAMA_NEW_RUNNERS:-""}
 if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
    DOCKERFILE_DIR="./llama/"
 else
    DOCKERFILE_DIR="./"
 fi
 echo "Building Ollama"
 echo "VERSION=$VERSION"
		`@ -0,0 +1 @@`
							`LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555`
		`@ -1,3 +0,0 @@`
			`package generate`

			`//go:generate bash ./gen_darwin.sh`
		`@ -1,3 +0,0 @@`
			`package generate`

			`//go:generate bash ./gen_linux.sh`
		`@ -1,3 +0,0 @@`
			`package generate`

			`//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1`
		`@ -1 +0,0 @@`
			`Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555`