Merge https://github.com/ollama/ollama

Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
2024-11-10 22:43:23 +05:30 · 2024-11-10 22:43:23 +05:30 · aa3a7dea34
commit aa3a7dea34
parent 77dfd8ba16 c2e8cbaa14
85 changed files with 3458 additions and 15884 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -3,9 +3,7 @@ ollama
 app
 macapp
 dist
-llm/llama.cpp
 .env
 .cache
 test_data
-llm/build
 llama/build
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,3 @@
-llm/ext_server/* linguist-vendored
 llama/**/*.cpp linguist-vendored
 llama/**/*.hpp linguist-vendored
 llama/**/*.h linguist-vendored
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -1,5 +1,9 @@
 name: release

+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+
 on:
  push:
    tags:
@ -8,7 +12,7 @@ on:
 jobs:
  # Full build of the Mac assets
  build-darwin:
-    runs-on: macos-12
+    runs-on: macos-13
    environment: release
    steps:
      - uses: actions/checkout@v4
@ -39,8 +43,8 @@ jobs:
          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
          APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
+          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
        run: |
          ./scripts/build_darwin.sh

@ -48,8 +52,8 @@ jobs:
        with:
          name: dist-darwin
          path: |
-            dist/*arwin*
-            !dist/*-cov
+            dist/Ollama-darwin.zip
+            dist/ollama-darwin

  # Windows builds take a long time to both install the dependencies and build, so parallelize
  # CPU generation step
@ -60,51 +64,34 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - run: go get ./...
      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          go generate -x ./...
-        name: go generate
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make
+        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            build/**/*
-            build/**/*.a
-            llm/build/**/*.a
            dist/windows-amd64/**

  # ROCm generation step
@ -115,74 +102,55 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install ROCm'
+      # ROCm installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-      - name: 'gather rocm dependencies'
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: make rocm runner
        run: |
-          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          md "dist\deps\bin\rocblas\library"
-          cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make rocm
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
            build/**/*
            dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-rocm-deps
-          path: dist/deps/*

  # CUDA generation step
  generate-windows-cuda:
@ -191,88 +159,80 @@ jobs:
    strategy:
      matrix:
        cuda:
-          - version: "11"
-            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
-          - version: "12"
-            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
+          - version: "11.3"
+            url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+          - version: "12.4"
+            url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Install msys2
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA ${{ matrix.cuda.version }}'
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ matrix.cuda.url }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: go generate
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      - name: make cuda runner
        run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-      - name: 'gather cuda dependencies'
-        run: |
-          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
-          md "dist\deps"
-          cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            build/**/*
            dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-cuda-deps-${{ matrix.cuda.version }}
-          path: dist/deps/*
-

  # windows arm64 generate, go build, and zip file (no installer)
  # Output of this build is aggregated into the final x86 build
@ -292,6 +252,30 @@ jobs:
          choco install -y --no-progress git gzip
          echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # pacman is buggy on Windows arm64, so we avoid running it and rely on prebuilt binary artifacts instead.
+      # We download the sfx (7zip bundle); it isn't a fully set-up install, but the binaries we need for the build work.
+      - name: Install msys2 x64
+        run: |
+          $url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
+          write-host "Downloading MSYS2"
+          Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
+              '-y', '-oC:\'
+              ) -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # Since pacman isn't reliable here, we download the make package tar file and extract it directly.
+      - name: Downloading and extracting msys2 make tar file
+        run: |
+          $url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
+          write-host "Downloading make"
+          Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
+          cd c:\msys64; tar -xf make.tar.zst
+          rm c:\msys64\make.tar.zst
+      - name: Verify Make works properly
+        run: |
+          echo $env:PATH
+          make --version
      - name: Install Visual Studio 2022
        run: |
          $components = @(
@ -385,13 +369,12 @@ jobs:
      - run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
+          import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
+          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
          $env:ARCH="arm64"
-          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
+          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip
        name: 'Windows Build'
      - uses: actions/upload-artifact@v4
        with:
@ -441,6 +424,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@ -451,19 +452,10 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-11
+          name: generate-windows-cuda-11.3
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-11
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-rocm-deps
+          name: generate-windows-cuda-12.4
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm
@ -473,12 +465,11 @@ jobs:
          path: dist
      - run: dir build
      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:OLLAMA_SKIP_GENERATE="1"
+          $env:ARCH="amd64"
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -1,5 +1,11 @@
 name: test

+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+  CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
+  CUDA_12_WINDOWS_VER: 12.4
+
 concurrency:
  # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
  # cancels running CI jobs and starts all new ones.
@ -21,9 +27,6 @@ jobs:
  changes:
    runs-on: ubuntu-latest
    outputs:
-      GENERATE: ${{ steps.changes.outputs.GENERATE }}
-      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
-      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
      RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
    steps:
      - uses: actions/checkout@v4
@ -39,53 +42,12 @@ jobs:
          }

          {
-            echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
            echo RUNNERS=$(changed 'llama/**')
          } >>$GITHUB_OUTPUT

-  generate:
+  runners-linux-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE == 'True' }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
-        arch: [amd64, arm64]
-        exclude:
-          - os: ubuntu-latest
-            arch: arm64
-          - os: windows-2019
-            arch: arm64
-    runs-on: ${{ matrix.os }}
-    env:
-      GOARCH: ${{ matrix.arch }}
-      CGO_ENABLED: '1'
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-          cache: true
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH"
-          echo $env:PATH
-          go generate -x ./...
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        name: 'Windows Go Generate'
-      - run: go generate -x ./...
-        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        name: 'Unix Go Generate'
-      - run: go build .
-  generate-cuda:
-    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        cuda-version:
@ -95,8 +57,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4
@ -107,12 +67,11 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
-  generate-rocm:
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores cuda_v11
+  runners-linux-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        rocm-version:
@ -122,8 +81,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl rocm-libs
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4
@ -134,14 +91,13 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores rocm

  # ROCm generation step
-  generate-windows-rocm:
+  runners-windows-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
@ -149,35 +105,50 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install ROCm'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # ROCm installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+
+      - name: make rocm runner
+        run: |
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make rocm

  # CUDA generation step
-  generate-windows-cuda:
+  runners-windows-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4
@ -185,37 +156,51 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ env.CUDA_12_WINDOWS_URL }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' 
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: go generate
-        run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append

-  runners:
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+      - name: make cuda runner
+        run: |
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+
+  runners-cpu:
    needs: [changes]
    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
@ -238,21 +223,30 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - run: go get ./...
+      - name: Add msys paths
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - name: 'Build Windows Go Runners'
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
-          make -C llama -j 4      
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -j 4      
      - name: 'Build Unix Go Runners'
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        run: make -C llama -j 4
+        run: make -j 4
      - run: go build .

  lint:
@ -302,9 +296,6 @@ jobs:
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
-      OLLAMA_CPU_TARGET: 'static'
-      OLLAMA_SKIP_CPU_GENERATE: '1'
-      OLLAMA_SKIP_METAL_GENERATE: '1'
    steps:
      - uses: actions/checkout@v4
        with:
@ -319,7 +310,6 @@ jobs:
            arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
-      - run: go generate ./...
      - run: go build
      - run: go test -v ./...

@ -333,4 +323,4 @@ jobs:
          submodules: recursive
      - name: Verify patches carry all the changes
        run: |
-          cd llama && make apply-patches sync && git diff --compact-summary --exit-code .
+          make apply-patches sync && git diff --compact-summary --exit-code llama
--- a/.gitmodules
+++ b/.gitmodules
@ -1,4 +0,0 @@
-[submodule "llama.cpp"]
-	path = llm/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
-	shallow = true
--- a/4
+++ b/4
@ -0,0 +1,4 @@
+GOALS := $(or $(MAKECMDGOALS),all)
+.PHONY: $(GOALS)
+$(GOALS):
+	$(MAKE) -C llama $@
--- a/README.md
+++ b/README.md
@ -12,7 +12,7 @@ Get up and running with large language models.

 [Download](https://ollama.com/download/Ollama-darwin.zip)

-### Windows preview
+### Windows

 [Download](https://ollama.com/download/OllamaSetup.exe)

@ -331,6 +331,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
+- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)

 ### Terminal

@ -454,6 +456,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
+- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)

 ### Supported backends

--- a/api/types.go
+++ b/api/types.go
@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
--- a/app/lifecycle/lifecycle.go
+++ b/app/lifecycle/lifecycle.go
@ -11,10 +11,12 @@ import (

 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tray"
+	"github.com/ollama/ollama/envconfig"
 )

 func Run() {
 	InitLogging()
+	slog.Info("app config", "env", envconfig.Values())

 	ctx, cancel := context.WithCancel(context.Background())
 	var done chan int
--- a/app/lifecycle/paths.go
+++ b/app/lifecycle/paths.go
@ -36,8 +36,13 @@ func init() {
 		ServerLogFile = filepath.Join(AppDataDir, "server.log")
 		UpgradeLogFile = filepath.Join(AppDataDir, "upgrade.log")

-		// Executables are stored in APPDATA
+		exe, err := os.Executable()
+		if err != nil {
+			slog.Warn("error discovering executable directory", "error", err)
 			AppDir = filepath.Join(localAppData, "Programs", "Ollama")
+		} else {
+			AppDir = filepath.Dir(exe)
+		}

 		// Make sure we have PATH set correctly for any spawned children
 		paths := strings.Split(os.Getenv("PATH"), ";")
@ -64,7 +69,7 @@ func init() {
 		}

 		// Make sure our logging dir exists
-		_, err := os.Stat(AppDataDir)
+		_, err = os.Stat(AppDataDir)
 		if errors.Is(err, os.ErrNotExist) {
 			if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
 				slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@ -18,11 +18,17 @@ func getCLIFullPath(command string) string {
 	var cmdPath string
 	appExe, err := os.Executable()
 	if err == nil {
+		// Check both the same location as the tray app, as well as ./bin
 		cmdPath = filepath.Join(filepath.Dir(appExe), command)
 		_, err := os.Stat(cmdPath)
 		if err == nil {
 			return cmdPath
 		}
+		cmdPath = filepath.Join(filepath.Dir(appExe), "bin", command)
+		_, err = os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
 	}
 	cmdPath, err = exec.LookPath(command)
 	if err == nil {
--- a/app/lifecycle/updater_windows.go
+++ b/app/lifecycle/updater_windows.go
@ -26,19 +26,15 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	slog.Info("starting upgrade with " + installerExe)
 	slog.Info("upgrade log file " + UpgradeLogFile)

-	// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
+	// make the upgrade show progress, but non interactive
 	installArgs := []string{
 		"/CLOSEAPPLICATIONS",                    // Quit the tray app if it's still running
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
-	}
-	// make the upgrade as quiet as possible (no GUI, no prompts)
-	installArgs = append(installArgs,
 		"/SP",                                   // Skip the "This will install... Do you wish to continue" prompt
-		"/SUPPRESSMSGBOXES",
+		"/NOCANCEL",                             // Disable the ability to cancel upgrade mid-flight to avoid partially installed upgrades
 		"/SILENT",
-		"/VERYSILENT",
-	)
+	}

 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
--- a/app/ollama.iss
+++ b/app/ollama.iss
@ -53,8 +53,8 @@ RestartIfNeededByRun=no
 ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
 WizardSmallImageFile=.\assets\setup.bmp

-; TODO verifty actual min windows version...
-; OG Win 10
+; Ollama requires Windows 10 22H2 or newer for proper unicode rendering
+; TODO: consider setting this to 10.0.19045
 MinVersion=10.0.10240

 ; First release that supports WinRT UI Composition for win32 apps
@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"

 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.

--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@ -11,12 +11,13 @@ import (
 )

 const (
-	updateAvailableMenuID = 1
-	updateMenuID          = updateAvailableMenuID + 1
-	separatorMenuID       = updateMenuID + 1
-	diagLogsMenuID        = separatorMenuID + 1
-	diagSeparatorMenuID   = diagLogsMenuID + 1
-	quitMenuID            = diagSeparatorMenuID + 1
+	_ = iota
+	updateAvailableMenuID
+	updateMenuID
+	separatorMenuID
+	diagLogsMenuID
+	diagSeparatorMenuID
+	quitMenuID
 )

 func (t *winTray) initMenus() error {
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
-					C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
+					uuid := C.CString(gpuInfo.ID)
+					defer C.free(unsafe.Pointer(uuid))
+					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 					if memInfo.err != nil {
 						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 						C.free(unsafe.Pointer(memInfo.err))
@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpu.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
--- a/discover/gpu_info_nvcuda.c
+++ b/discover/gpu_info_nvcuda.c
@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
  CUresult ret;
  resp->err = NULL;
  resp->num_devices = 0;
@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
      resp->cudaErr = -1;
      return;
    }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  }

+  LOG(resp->ch.verbose, "calling cuInit\n");
  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  resp->ch.driver_minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
    resp->cudaErr = ret;
    return;
  }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

 const int buflen = 256;
--- a/discover/gpu_info_nvml.c
+++ b/discover/gpu_info_nvml.c
@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  } l[] = {
      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
      {NULL, NULL},
  };
@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }


-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
    nvmlDevice_t device;
    nvmlMemory_t memInfo = {0};
    nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
    if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %d: %d", device_id, ret);
+        LOG(1, "unable to get device handle %s: %d", uuid, ret);
        *free = 0;
        return;
    }

    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+        LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
        *free = 0;
        return;
    }
--- a/discover/gpu_info_nvml.h
+++ b/discover/gpu_info_nvml.h
@ -25,7 +25,7 @@ typedef struct nvml_handle {
  uint16_t verbose;
  nvmlReturn_t (*nvmlInit_v2)(void);
  nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;

@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;

 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);

 #endif  // __GPU_INFO_NVML_H__
--- a/discover/gpu_linux.go
+++ b/discover/gpu_linux.go
@ -3,9 +3,11 @@ package discover
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"os"
 	"reflect"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/ollama/ollama/format"
@ -109,6 +111,10 @@ func GetCPUDetails() ([]CPU, error) {
 	if err != nil {
 		return nil, err
 	}
+	return linuxCPUDetails(file)
+}
+
+func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
@ -131,6 +137,9 @@ func GetCPUDetails() ([]CPU, error) {
 			cpu = &linuxCpuInfo{}
 		}
 	}
+	if cpu.ID != "" {
+		cpuInfos = append(cpuInfos, *cpu)
+	}

 	// Process the sockets/cores/threads
 	socketByID := map[string]*CPU{}
@ -177,10 +186,14 @@ func GetCPUDetails() ([]CPU, error) {
 			s.EfficiencyCoreCount = efficiencyCoreCount
 		}
 	}
-
-	result := []CPU{}
-	for _, c := range socketByID {
-		result = append(result, *c)
+	keys := make([]string, 0, len(socketByID))
+	result := make([]CPU, 0, len(socketByID))
+	for k := range socketByID {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		result = append(result, *socketByID[k])
 	}
 	return result, nil
 }
--- a/discover/gpu_linux_test.go
+++ b/discover/gpu_linux_test.go
--- a/discover/types.go
+++ b/discover/types.go
@ -175,6 +175,11 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
 		return 0
 	}
-	// Allocate thread count matching the performance cores on a single socket
-	return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
+
+	coreCount := 0
+	for _, c := range si.System.CPUs {
+		coreCount += c.CoreCount - c.EfficiencyCoreCount
+	}
+
+	return coreCount
 }
--- a/docs/api.md
+++ b/docs/api.md
@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
-    "f16_kv": true,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
--- a/docs/development.md
+++ b/docs/development.md
@ -1,183 +1,5 @@
 # Development

-> [!IMPORTANT]
-> The `llm` package that loads and runs models is being updated to use a new [Go runner](#transition-to-go-runner): this should only impact a small set of PRs however it does change how the project is built.
-
-Install required tools:
-
- cmake version 3.24 or higher
- go version 1.22 or higher
- gcc version 11.4.0 or higher
-
-### MacOS
-
-```bash
-brew install go cmake gcc
-```
-
-Optionally enable debugging and more verbose logging:
-
-```bash
-# At build time
-export CGO_CFLAGS="-g"
-
-# At runtime
-export OLLAMA_DEBUG=1
-```
-
-Get the required libraries and build the native LLM code:
-
-```bash
-go generate ./...
-```
-
-Then build ollama:
-
-```bash
-go build .
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-#### Linux ROCm (AMD)
-
-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Advanced CPU Settings
-
-By default, running `go generate ./...` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. If you would like to build a CPU-based build customized for your
-processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
-like to use. For example, to compile an optimized binary for an Intel i9-9880H,
-you might use:
-
-```
-OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
-go build .
-```
-
-#### Containerized Linux Build
-
-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
-
-### Windows
-
-Note: The Windows build for Ollama is still under development.
-
-First, install required tools:
-
- MSVC toolchain - C/C++ and cmake as minimal requirements
- Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
-  - [MinGW-w64](https://www.mingw-w64.org/)
-  - [MSYS2](https://www.msys2.org/)
- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
-
-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-go generate ./...
-go build .
-```
-
-#### Windows CUDA (NVIDIA)
-
-In addition to the common Windows development tools described above, install CUDA after installing MSVC.
-
- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
-
-
-#### Windows ROCm (AMD Radeon)
-
-In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
-
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
-
-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
-
-#### Windows arm64
-
-The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want.  To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
-
-```powershell
-import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
-Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
-```
-
-You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
-
-Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment.  Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
-
-```
-pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
-```
-
-You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-
-## Transition to Go runner
-
-The Ollama team is working on moving to a new Go based runner that loads and runs models in a subprocess to replace the previous code under `ext_server`. During this transition period, this new Go runner is "opt in" at build time, and requires using a different approach to build.
-
-After the transition to use the Go server exclusively, both `make` and `go generate` will build the Go runner.
-
 Install required tools:

 - go version 1.22 or higher
@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1
 Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)

 ```bash
-make -C llama -j 5
+make -j 5
 ```

 Then build ollama:
@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)

 ```
-make -C llama -j 5
+make -j 5
 ```

 Then build the binary:
@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)

 ```
-make -C llama -j 5
+make -j 5
 ```

 Then build the binary:
@ -286,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

@ -296,16 +118,19 @@ The following tools are required as a minimal development environment to build C
  - https://go.dev/dl/
 - Git
  - https://git-scm.com/download/win
- GCC and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
+- clang with gcc compat and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
  - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-ucrt-x86_64-gcc make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `c:\msys64\ucrt64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
+  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+
+> [!NOTE]  
+> Due to bugs in the GCC C++ library for Unicode support, Ollama should be built with clang on Windows.

 Then, build the `ollama` binary:

 ```powershell
 $env:CGO_ENABLED="1"
-make -C llama -j 8
+make -j 8
 go build .
 ```

--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a

 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.

-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:

 `pip install langchain_community`

--- a/docs/windows.md
+++ b/docs/windows.md
@ -1,22 +1,15 @@
-# Ollama Windows Preview
+# Ollama Windows

-Welcome to the Ollama Windows preview.
+Welcome to Ollama for Windows.

 No more WSL required!

 Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
+After installing Ollama for Windows, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
 `http://localhost:11434`.

-As this is a preview release, you should expect a few bugs here and there.  If
-you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an
-[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
-
 ## System Requirements

 * Windows 10 22H2 or newer, Home or Pro
@ -25,6 +18,32 @@ Logs will often be helpful in diagnosing the problem (see

 Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.

+## Filesystem Requirements
+
+The Ollama install does not require Administrator privileges, and installs in your home directory by default.  You'll need at least 4GB of space for the binary install.  Once you've installed Ollama, you'll need additional space for storing the large language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application in a location other than your home directory, start the installer with the following flag:
+
+```powershell
+OllamaSetup.exe /DIR="d:\some\location"
+```
+
+### Changing Model Location
+
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+
+2. Click on _Edit environment variables for your account_.
+
+3. Edit or create a variable named `OLLAMA_MODELS` for your user account, and set it to the directory where you want the models stored.
+
+4. Click OK/Apply to save.
+
+If Ollama is already running, quit the tray application and relaunch it from the Start menu, or from a new terminal started after you saved the environment variables.
+
 ## API Access

 Here's a quick example showing API access from `powershell`
@ -34,10 +53,6 @@ Here's a quick example showing API access from `powershell`

 ## Troubleshooting

-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increases logging for the GUI app and
-server.
-
 Ollama on Windows stores files in a few different locations.  You can view them in
 the explorer window by pressing `<Win>+R` and typing in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
@ -52,6 +67,10 @@ the explorer window by hitting `<cmd>+R` and type in:

 The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

+> [!NOTE]
+> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the uninstaller will not remove your downloaded models.
+
+
 ## Standalone CLI

 The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@ -31,7 +31,7 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 }

 func TestUnicode(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
@ -42,9 +42,15 @@ func TestUnicode(t *testing.T) {
 		Options: map[string]interface{}{
 			"temperature": 0,
 			"seed":        123,
+			// Workaround deepseek context shifting bug
+			"num_ctx":     8192,
+			"num_predict": 2048,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"散射", "频率"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
 }

 func TestExtendedUnicodeOutput(t *testing.T) {
@ -60,7 +66,10 @@ func TestExtendedUnicodeOutput(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
 }

 func TestUnicodeModelDir(t *testing.T) {
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@ -60,7 +60,8 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
+			// Note: CPU based inference can crawl so don't give up too quickly
+			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
 		}(i)
 	}
 	wg.Wait()
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@ -12,7 +12,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func TestIntegrationMultimodal(t *testing.T) {
+func TestIntegrationLlava(t *testing.T) {
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
@ -39,6 +39,33 @@ func TestIntegrationMultimodal(t *testing.T) {
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }

+func TestIntegrationMllama(t *testing.T) {
+	image, err := base64.StdEncoding.DecodeString(imageEncoding)
+	require.NoError(t, err)
+	req := api.GenerateRequest{
+		// TODO fix up once we publish the final image
+		Model:  "x/llama3.2-vision",
+		Prompt: "what does the text in this image say?",
+		Stream: &stream,
+		Options: map[string]interface{}{
+			"seed":        42,
+			"temperature": 0.0,
+		},
+		Images: []api.ImageData{
+			image,
+		},
+	}
+
+	resp := "the ollamas"
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	// mllama models on CPU can be quite slow to start.
+	DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
+}
+
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
 AAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAABIAAAAAQAAAEgAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAANKgAwAEAAAAAQAA
 AHgAAAAAXdsepgAAAAlwSFlzAAALEwAACxMBAJqcGAAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6
--- a/llama/Dockerfile
+++ b/llama/Dockerfile
@ -1,221 +0,0 @@
-# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
-ARG GOLANG_VERSION=1.22.8
-ARG CMAKE_VERSION=3.22.1
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-ARG ROCM_VERSION=6.1.2
-
-### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-#
-# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
-# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-#
-### Then incremental builds will be much faster in this container
-#
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
-#
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-# TODO intel oneapi goes here...
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-# Note: this does not contain jetson variants
-#
-# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
-# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-#
-FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-    dnf config-manager --set-enabled appstream && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
-    else \
-        make -C llama -j 5 ; \
-    fi
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
-
-
-# Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH amd64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/amd64 builder-amd64 AS build-amd64
-COPY . .
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-ARG OLLAMA_SKIP_ROCM_GENERATE
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
-
-FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH arm64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/arm64 builder-arm64 AS build-arm64
-COPY . .
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-
-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH AS dist
-
-
-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM runners-amd64 AS runners-cuda-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
-    ./dist/linux-amd64/lib/ollama/runners/rocm*
-
-FROM runners-amd64 AS runners-rocm-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
-    ./dist/linux-amd64/lib/ollama/libcu*.so* \
-    ./dist/linux-amd64/lib/ollama/runners/cuda*
-
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-
-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
-
-FROM runtime-$TARGETARCH
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-ENV NVIDIA_VISIBLE_DEVICES=all
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
--- a/llama/README.md
+++ b/llama/README.md
@ -95,31 +95,17 @@ make -j

 Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model.  While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.  A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.

-> [!IMPORTANT]
-> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit.  After merging that PR a new manifest file we be utilized
-
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 ### Updating Base Commit

 **Pin to new base commit**

-To update to a newer base commit, select the upstream git tag or commit
-
-> [!IMPORTANT]
-> After merging #7157 a manifest will be used instead of the submodule
-
-```
-cd llm/llama.cpp
-git fetch
-git checkout NEW_BASE_COMMIT
-cd ..
-git add llama.cpp
-```
+To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`.

 #### Applying patches

@ -128,13 +114,13 @@ When updating to a newer base commit, the existing patches may not apply cleanly
 Start by applying the patches.  If any of the patches have conflicts, the `git am` will stop at the first failure.

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 If you see an error message about a conflict, go into the `./vendor/` directory, and resolve the conflict in the failed patch commit using your preferred merge tool.  Save the file(s) and continue the patch series with `git am --continue`.  If any additional patches fail, follow the same pattern until the full patch series is applied.  Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.

 ```
-make -C llama create-patches sync
+make create-patches sync
 ```

 Build and test Ollama, and make any necessary changes to the Go code based on the new base commit.  Submit your PR to the Ollama repo.
@ -144,14 +130,14 @@ Build and test Ollama, and make any necessary changes to the Go code based on th
 When working on new fixes or features that impact vendored code, use the following model.  First get a clean tracking repo with all current patches applied:

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 Now edit the upstream native code in the `./vendor/` directory.  You do not need to commit every change in order to build; a dirty working tree in the tracking repo is OK while developing.  Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:

 ```
-make -C llama sync
-make -C llama -j 8
+make sync
+make -j 8
 go build .
 ```

@ -161,7 +147,7 @@ go build .
 Iterate until you're ready to submit PRs.  Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with

 ```
-make -C llama create-patches
+make create-patches
 ```

 > [!IMPORTANT]
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@ -2699,7 +2699,7 @@ struct llama_hparams {
        GGML_ABORT("fatal error");
    }

-    bool cross_attention_layer(uint32_t il) const {
+    bool cross_attention_layers(uint32_t il) const {
        return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
    }
 };
@ -2731,6 +2731,9 @@ struct llama_cparams {
    bool offload_kqv;
    bool flash_attn;
    bool no_perf;
+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    bool cross_attn = false;

    enum llama_pooling_type pooling_type;

@ -3542,10 +3545,6 @@ struct llama_context {
    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]

-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    float * cross_attn_state = nullptr;
-    bool cross_attn_state_first_pass = true;
    struct ggml_tensor * inp_cross_attn_state; // F32 [n_embd, 1601, 4]
 };

@ -3782,7 +3781,7 @@ static bool llama_kv_cache_init(

    for (int i = 0; i < (int) n_layer; i++) {
        // for cross attention layers
-        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
            struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
            ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
            ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
@ -7389,7 +7388,7 @@ static bool llm_load_tensors(

                        auto & layer = model.layers[i];

-                        if (hparams.cross_attention_layer(i)) {
+                        if (hparams.cross_attention_layers(i)) {
                            layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128});
                            layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024});
                            layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd});
@ -9368,11 +9367,10 @@ static struct ggml_tensor * llm_build_inp_cross_attn_state(
         const llm_build_cb & cb) {
    const int64_t n_embd = hparams.n_embd;

-    struct ggml_tensor * inpCAS;
-    lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
-    cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
-    ggml_set_input(lctx.inp_cross_attn_state);
-    inpCAS = lctx.inp_cross_attn_state;
+    struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
+    cb(inpCAS, "inp_cross_attn_state", -1);
+    ggml_set_input(inpCAS);
+    lctx.inp_cross_attn_state = inpCAS;

    return inpCAS;
 }
@ -10979,8 +10977,8 @@ struct llm_build_context {
                    LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);

-            if (hparams.cross_attention_layer(il)) {
-                if (!lctx.cross_attn_state) {
+            if (hparams.cross_attention_layers(il)) {
+                if (!batch.embd && !cparams.cross_attn) {
                    continue;
                }

@ -10991,42 +10989,28 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                cb(Qcur, "Qcur", il);

-                Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
-                cb(Qcur, "Qcur", il);
-
-                // TODO: is this required?
-                Qcur = ggml_cont(ctx0, Qcur);
+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
                cb(Qcur, "Qcur", il);

                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
                cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur;
-                if (lctx.cross_attn_state_first_pass) {
+                struct ggml_tensor * Kcur, * Vcur;
+                if (batch.embd) {
                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
                    cb(Kcur, "Kcur", il);

                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
                    cb(Kcur, "Kcur", il);

-                    Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
-                    cb(Kcur, "Kcur", il);
-
-                    // TODO: is this required?
-                    Kcur = ggml_cont(ctx0, Kcur);
+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
                    cb(Kcur, "Kcur", il);

                    Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
                    cb(Kcur, "Kcur", il);

                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
-                } else {
-                    Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
-                    cb(Kcur, "Kcur (view)", il);
-                }

-                struct ggml_tensor * Vcur;
-                if (lctx.cross_attn_state_first_pass) {
                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
                    cb(Vcur, "Vcur", il);

@ -11038,6 +11022,9 @@ struct llm_build_context {

                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
                } else {
+                    Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
+                    cb(Kcur, "Kcur (view)", il);
+
                    Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
                    cb(Vcur, "Vcur (view)", il);
                }
@ -11045,11 +11032,8 @@ struct llm_build_context {
                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
                cb(kq, "kq", il);

-                kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
-                cb(kq, "kq_scaled", il);
-
                // TODO: apply causal masks
-                struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
                cb(kq_soft_max, "kq_soft_max", il);

                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
@ -17197,11 +17181,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
    }

    if (batch.embd) {
+        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+            // zero out inp_embd since it's not used
+            float * inp_embd_data = (float *)lctx.inp_embd->data;
+            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+                inp_embd_data[i] = 0.0f;
+            }
+        } else {
            const int64_t n_embd   = hparams.n_embd;
            const int64_t n_tokens = batch.n_tokens;

            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
        }
+    }

    if (batch.pos && lctx.inp_pos) {
        const int64_t n_tokens = batch.n_tokens;
@ -17209,14 +17202,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
    }

-    // TODO (jmorganca): this might copy a lot of data on every request of a
-    // single generation even though it doesn't change, so we should
-    // find a way to not set this more than one time per image
-    if (lctx.inp_cross_attn_state &&
-        lctx.inp_cross_attn_state->buffer) {
-        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
-    }
-
    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        const int64_t n_tokens = batch.n_tokens;
@ -17789,7 +17774,7 @@ static int llama_decode_internal(
        n_outputs = 1;
    }

-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
        /* simple_split */ !kv_self.recurrent,
        /* logits_all   */ n_outputs == n_tokens_all);

@ -17899,10 +17884,6 @@ static int llama_decode_internal(

        llama_set_inputs(lctx, ubatch);

-        // TODO: replace with something better to find out if its
-        // our first actual pass
-        lctx.cross_attn_state_first_pass = false;
-
        llama_graph_compute(lctx, gf, n_threads, threadpool);

        // update the kv ring buffer
@ -18086,7 +18067,7 @@ static int llama_encode_internal(

    const int64_t n_embd = hparams.n_embd;

-    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

    const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@ -20194,11 +20175,6 @@ struct llama_context * llama_new_context_with_model(
    return ctx;
 }

-void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
-    ctx->cross_attn_state_first_pass = true;
-    ctx->cross_attn_state = cross_attn_state;
-}
-
 void llama_free(struct llama_context * ctx) {
    delete ctx;
 }
@ -21686,6 +21662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
 }

+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->cparams.cross_attn = cross_attention;
+}
+
 struct llama_batch llama_batch_get_one(
             llama_token * tokens,
                 int32_t   n_tokens,
@ -21695,6 +21675,7 @@ struct llama_batch llama_batch_get_one(
        /*n_tokens       =*/ n_tokens,
        /*tokens         =*/ tokens,
        /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -21710,6 +21691,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
        /*n_tokens       =*/ 0,
        /*tokens         =*/ nullptr,
        /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -21721,6 +21703,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }
--- a/llama/llama.go
+++ b/llama/llama.go
@ -1,5 +1,7 @@
 package llama

+//go:generate make -j 8
+
 /*
 #cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
 #cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
@ -66,6 +68,17 @@ package llama
 #include "sampling_ext.h"

 bool llamaProgressCallback(float progress, void *user_data);
+
+typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER; // compiler family used to build this cgo preamble
+COMPILER inline get_compiler() { // classifies the compiler from its predefined macros
+#if defined(__clang__)
+	return COMP_CLANG; // tested before __GNUC__ because clang defines that macro too
+#elif defined(__GNUC__)
+	return COMP_GCC;
+#else
+	return COMP_UNKNOWN; // fallback must be an enumerator declared in COMPILER
+#endif
+}
 */
 import "C"

@ -75,6 +88,7 @@ import (
 	"fmt"
 	"runtime"
 	"runtime/cgo"
+	"slices"
 	"strings"
 	"unsafe"
 )
@ -86,7 +100,38 @@ func BackendInit() {
 }

+// PrintSystemInfo returns the string from llama_print_system_info followed by
+// a tag naming the compiler reported by get_compiler: "cgo(gcc)",
+// "cgo(clang)" or "cgo(unknown_compiler)".
 func PrintSystemInfo() string {
-	return C.GoString(C.llama_print_system_info())
+	var compiler string
+	switch C.get_compiler() {
+	case C.COMP_UNKNOWN:
+		compiler = "cgo(unknown_compiler)"
+	case C.COMP_GCC:
+		compiler = "cgo(gcc)"
+	case C.COMP_CLANG:
+		compiler = "cgo(clang)"
+	}
+	return C.GoString(C.llama_print_system_info()) + compiler
+}
+
+// GetModelArch returns the "general.architecture" metadata value of the GGUF
+// file at modelPath. The file is opened with no_alloc set.
+//
+// An error is returned if the file cannot be opened as GGUF, if the key is
+// missing, or if its value is not a string.
+func GetModelArch(modelPath string) (string, error) {
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+
+	ggufCtx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
+	if ggufCtx == nil {
+		return "", errors.New("unable to load model file")
+	}
+	defer C.gguf_free(ggufCtx)
+
+	key := C.CString("general.architecture")
+	defer C.free(unsafe.Pointer(key))
+	archIndex := C.gguf_find_key(ggufCtx, key)
+	if int(archIndex) < 0 {
+		return "", errors.New("unknown model architecture")
+	}
+
+	// gguf_get_val_str asserts that the value is a string, so reject any
+	// other type here and report a malformed file as an ordinary error.
+	if C.gguf_get_kv_type(ggufCtx, archIndex) != C.GGUF_TYPE_STRING {
+		return "", errors.New("unknown model architecture")
+	}
+
+	return C.GoString(C.gguf_get_val_str(ggufCtx, archIndex)), nil
 }

 type ContextParams struct {
@ -216,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	}

 	m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
-	if m.c == (*C.struct_llama_model)(C.NULL) {
+	if m.c == nil {
 		return nil, fmt.Errorf("unable to load model: %s", modelPath)
 	}

@ -232,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
 		c:          C.llama_new_context_with_model(model.c, params.c),
 		numThreads: int(params.c.n_threads),
 	}
-	if c.c == (*C.struct_llama_context)(C.NULL) {
+	if c.c == nil {
 		return nil, errors.New("unable to create llama context")
 	}

@ -256,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 	defer C.free(unsafe.Pointer(cLoraPath))

 	loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+	if loraAdapter == nil {
+		return errors.New("unable to load lora")
+	}

 	err := -1
 	if loraAdapter != nil {
@ -271,18 +319,40 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
+// Batch wraps a llama_batch together with the sizes it was created with.
+// batchSize, maxSeq and embedSize hold the values passed to NewBatch.
 type Batch struct {
 	c         C.struct_llama_batch
 	batchSize int
+	maxSeq    int
 	embedSize int
 }

-// Creates a new batch for either word tokens if embed is 0 or
-// image embeddings if embed is specified. Batches cannot contain
-// both types at the same time
-func NewBatch(nTokens int, embed int, maxSeq int) *Batch {
-	return &Batch{
-		c:         C.llama_batch_init(C.int(nTokens), C.int(embed), C.int(maxSeq)),
-		batchSize: nTokens,
-		embedSize: embed,
+// NewBatch creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
+// Batches cannot contain both types at the same time. batchSize is the maximum number of entries
+// that can be added per sequence; storage for batchSize*maxSeq entries is requested from
+// llama_batch_init. If any of the underlying allocations failed, the batch is freed and an
+// error is returned.
+func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
+	b := Batch{
+		c:         C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
+		batchSize: batchSize,
+		maxSeq:    maxSeq,
+		embedSize: embedSize,
 	}
+
+	// Check to see if any of the allocations in llama_batch_init() failed.
+	// The || chain short-circuits, so the per-entry scan of seq_id only runs
+	// once seq_id itself is known to be non-nil.
+	nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
+		b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
+		slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
+
+	if nilPointer {
+		C.llama_batch_free(b.c)
+		return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
+	}
+
+	return &b, nil
+}
+
+// Size returns the batchSize the batch was created with.
+func (b *Batch) Size() int {
+	return b.batchSize
+}
+
+// allocSize returns batchSize*maxSeq, the entry count NewBatch passes to
+// llama_batch_init and the length used when slicing the batch's C arrays.
+func (b *Batch) allocSize() int {
+	return b.batchSize * b.maxSeq
 }

 func (b *Batch) NumTokens() int {
@ -297,21 +367,21 @@ func (b *Batch) IsEmbedding() bool {
 // when the batch was initialized. The other argument will be ignored. Adds to the
 // batch with the given position for the given sequence ids, and optionally instructs
 // to include logits.
-func (b *Batch) Add(token int, embed []float32, pos int, seqIds []int, logits bool) {
+func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...int) {
 	if !b.IsEmbedding() {
-		unsafe.Slice(b.c.token, b.batchSize)[b.c.n_tokens] = C.llama_token(token)
+		unsafe.Slice(b.c.token, b.allocSize())[b.c.n_tokens] = C.llama_token(token)
 	} else {
-		copy(unsafe.Slice((*float32)(b.c.embd), b.batchSize*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
+		copy(unsafe.Slice((*float32)(b.c.embd), b.allocSize()*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
 	}
-	unsafe.Slice(b.c.pos, b.batchSize)[b.c.n_tokens] = C.llama_pos(pos)
-	unsafe.Slice(b.c.n_seq_id, b.batchSize)[b.c.n_tokens] = C.int(len(seqIds))
+	unsafe.Slice(b.c.pos, b.allocSize())[b.c.n_tokens] = C.llama_pos(pos)
+	unsafe.Slice(b.c.n_seq_id, b.allocSize())[b.c.n_tokens] = C.int(len(seqIds))

 	for i, s := range seqIds {
-		unsafe.Slice((unsafe.Slice(b.c.seq_id, b.batchSize)[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
+		unsafe.Slice((unsafe.Slice(b.c.seq_id, b.allocSize())[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
 	}

 	if logits {
-		unsafe.Slice(b.c.logits, b.batchSize)[b.c.n_tokens] = 1
+		unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
 	}

 	b.c.n_tokens += 1
@ -421,71 +491,42 @@ func Quantize(infile, outfile string, ftype uint32) error {
 	return nil
 }

-// llava
+// vision processing
+
+// ClipContext holds the clip model handle returned by clip_model_load.
 type ClipContext struct {
 	c *C.struct_clip_ctx
-	m        *C.struct_mllama_ctx
-	IsMllama bool
-	embedPin runtime.Pinner
-	pinned   bool
 }

-func getVisionArch(mp *C.char) (string, error) {
-	gguf_ctx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
-	if gguf_ctx == nil {
-		return "", errors.New("unable to load vision projector")
-	}
-	defer C.gguf_free(gguf_ctx)
-
-	arch_index := C.gguf_find_key(gguf_ctx, C.CString("general.architecture"))
-	if int(arch_index) < 0 {
-		return "", errors.New("unknown vision model architecture")
-	}
-
-	arch := C.gguf_get_val_str(gguf_ctx, arch_index)
-
-	return C.GoString(arch), nil
-}
-
-func NewClipContext(modelPath string) (*ClipContext, error) {
+// NewClipContext loads the clip model at modelPath and verifies that its
+// projector embedding size (clip_n_mmproj_embd) equals the NEmbd of
+// llamaContext's model. The loaded model is freed again if the sizes differ.
+func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, error) {
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
-
-	arch, err := getVisionArch(mp)
-	if err != nil {
-		return nil, err
+	c := C.clip_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
 	}

-	var cc ClipContext
-	if arch == "clip" {
-		cc.c = C.clip_model_load(mp, 1)
-	} else if arch == "mllama" {
-		cc.m = C.mllama_model_load(mp, 1)
-		cc.IsMllama = true
-	} else {
-		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
+	projEmbedSize := int(C.clip_n_mmproj_embd(c))
+	modelEmbedSize := llamaContext.Model().NEmbd()
+	if projEmbedSize != modelEmbedSize {
+		// No ClipContext is handed to the caller on this path, so the model
+		// must be released here or it is leaked.
+		C.clip_free(c)
+		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
 	}

-	// XXX: check embedding size?
-	return &cc, nil
+	return &ClipContext{c: c}, nil
 }

+// Free releases the clip model by passing the wrapped handle to clip_free.
 func (c *ClipContext) Free() {
-	if c.c != nil {
 	C.clip_free(c.c)
-	}
-	if c.m != nil {
-		C.mllama_free(c.m)
-	}
 }

-func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte) [][]float32 {
-	c := C.llava_image_embed_make_with_bytes(clipContext.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
+	l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+	if l == nil {
+		return nil, errors.New("unable to make llava embedding from image")
+	}

-	numTokens := int(c.n_image_pos)
+	numTokens := int(l.n_image_pos)
 	numEmbed := llamaContext.Model().NEmbd()

-	s := unsafe.Slice((*float32)(c.embed), numEmbed*numTokens)
+	s := unsafe.Slice((*float32)(l.embed), numEmbed*numTokens)

 	embed := make([][]float32, numTokens)
 	rows := make([]float32, len(s))
@ -495,51 +536,66 @@ func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []
 		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
 	}

-	C.llava_image_embed_free(c)
+	C.llava_image_embed_free(l)

-	return embed
+	return embed, nil
 }

-func NewMllamaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte, aspectRatioId int) [][]float32 {
+// MllamaContext holds the mllama model handle returned by mllama_model_load.
+type MllamaContext struct {
+	c *C.struct_mllama_ctx
+}
+
+// NewMllamaContext loads the mllama vision model at modelPath and verifies
+// that its embedding size (mllama_n_embd) equals the NEmbd of llamaContext's
+// model. The loaded model is freed again if the sizes differ.
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+	mp := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(mp))
+	c := C.mllama_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+	}
+
+	projEmbedSize := int(C.mllama_n_embd(c))
+	modelEmbedSize := llamaContext.Model().NEmbd()
+	if projEmbedSize != modelEmbedSize {
+		// No MllamaContext is handed to the caller on this path, so the
+		// model must be released here or it is leaked.
+		C.mllama_free(c)
+		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+	}
+
+	return &MllamaContext{c: c}, nil
+}
+
+// Free releases the mllama model by passing the wrapped handle to mllama_free.
+func (m *MllamaContext) Free() {
+	C.mllama_free(m.c)
+}
+
+// NewEmbed decodes the image bytes in data with mllama_image_load_from_data
+// and encodes them with mllama_image_encode. The result is a one-element
+// slice whose only row has m.EmbedSize(llamaContext) values.
+//
+// An error is returned if data is empty, if the image cannot be loaded, or if
+// encoding fails.
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
+	// &data[0] below would panic on an empty slice; report it as an error.
+	if len(data) == 0 {
+		return nil, errors.New("unable to load mllama image data")
+	}
+
 	img := C.mllama_image_init()
 	defer C.mllama_image_free(img)

-	C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
-
-	numTokens := int(C.mllama_n_positions(clipContext.m) * C.mllama_n_tiles(clipContext.m))
-	numEmbed := llamaContext.Model().NEmbd()
-
-	rows := make([]float32, numEmbed*numTokens)
-	C.mllama_image_encode(clipContext.m, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
-
-	embed := make([][]float32, numTokens)
-	for i := range embed {
-		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
+	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+	if !ok {
+		return nil, errors.New("unable to load mllama image data")
 	}

-	return embed
+	rows := make([]float32, m.EmbedSize(llamaContext))
+	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+	if !ok {
+		return nil, errors.New("unable to make mllama embedding from image")
+	}
+
+	embed := make([][]float32, 1)
+	embed[0] = rows
+
+	return embed, nil
 }

-// This really needs to be set on a batch instead
-func MllamaSetCrossAttn(llamaContext *Context, clipContext *ClipContext, embed [][]float32) {
-	if embed != nil {
-		if clipContext.pinned {
-			panic("Cross attention state already pinned")
-		}
+// EmbedSize returns mllama_n_positions * mllama_n_tiles multiplied by the
+// NEmbd of llamaContext's model; NewEmbed uses it as the length of the
+// float32 buffer it passes to mllama_image_encode.
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
+	numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
+	numEmbed := llamaContext.Model().NEmbd()

-		embedData := &embed[0][0]
-		clipContext.embedPin.Pin(embedData)
-		clipContext.pinned = true
+	return numTokens * numEmbed
+}

-		C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(unsafe.Pointer(embedData)))
-	} else {
-		C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(C.NULL))
-
-		if clipContext.pinned {
-			clipContext.embedPin.Unpin()
-			clipContext.pinned = false
-		}
-	}
+// SetCrossAttention passes state to llama_set_cross_attention for this context.
+func (c *Context) SetCrossAttention(state bool) {
+	C.llama_set_cross_attention(c.c, C.bool(state))
 }

 // sampling
@ -567,7 +623,7 @@ type SamplingParams struct {
 	Grammar        string
 }

-func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
+func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
 	var cparams C.struct_gpt_sampler_cparams
 	cparams.top_k = C.int32_t(params.TopK)
 	cparams.top_p = C.float(params.TopP)
@ -590,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {

 	cparams.grammar = grammar
 	context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
+	if context.c == nil {
+		return nil, errors.New("unable to create sampling context")
+	}
+
 	runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })

-	return context
+	return context, nil
 }

 func (s *SamplingContext) Reset() {
--- a/llama/llama.h
+++ b/llama/llama.h
@ -266,6 +266,7 @@ extern "C" {

        llama_token  *  token;
        float        *  embd;
+        int32_t         n_embd;
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
@ -451,7 +452,7 @@ extern "C" {

    // TODO (jmorganca): this should most likely be passed in as part of a batch
    // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);

    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
--- a/llama/llava.cpp
+++ b/llama/llava.cpp
@ -435,7 +435,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@ -58,6 +58,8 @@ endif
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
 	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	-mf16c \
+	-mfma \
 	-parallel-jobs=2 \
 	-c \
 	-O3 \
@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
 	-D_CRT_SECURE_NO_WARNINGS \
 	-D_GNU_SOURCE \
 	-D_XOPEN_SOURCE=600 \
+	-DUSE_PROF_API=1 \
+	-std=gnu++14 \
+	-x hip \
 	-mllvm=-amdgpu-early-inline-all=true \
 	-mllvm=-amdgpu-function-calls=false \
 	-Wno-expansion-to-defined \
@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-unused-result \
 	-I.

+# Workaround buggy P2P copy on some windows multi-GPU setups
+# This workaround breaks linux systems with small system RAM, so only enable on windows
+ifeq ($(OS),windows)
+	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
+endif
+
 include make/gpu.make

 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
@ -1,11 +1,12 @@
 # Helpers for managing our vendored llama.cpp repo and patch set

-# TODO - this should include a manifest file at the top of the tree 
-LLAMACPP_BASE_COMMIT=$(shell cd ../llm/llama.cpp && git rev-parse HEAD)
+REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
+DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))

-LLAMACPP_REPO := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))vendor/
+include $(REPO_ROOT)llama/vendoring
+
+LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/

-DST_DIR=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
 LLAMACPP_PATCH_DIR := $(DST_DIR)patches/


--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
 	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@

 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
--- a/llama/patches/0010-add-mllama-support.patch
+++ b/llama/patches/0010-add-mllama-support.patch
@ -12,27 +12,49 @@ kv cache once per run

 remaining is to implement the cross attention mask
 ---
- include/llama.h |   4 +
- src/llama.cpp   | 456 ++++++++++++++++++++++++++++++++++++++++++++++--
- 2 files changed, 447 insertions(+), 13 deletions(-)
+ examples/llava/llava.cpp |   2 +-
+ include/llama.h          |   5 +
+ src/llama.cpp            | 447 +++++++++++++++++++++++++++++++++++++--
+ 3 files changed, 436 insertions(+), 18 deletions(-)

+diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
+index 8558c6bd..37b2f2e2 100644
+--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
+@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+         if (n_eval > n_batch) {
+             n_eval = n_batch;
+         }
+-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+         if (llama_decode(ctx_llama, batch)) {
+             LOG_ERR("%s : failed to eval\n", __func__);
+             return false;
 diff --git a/include/llama.h b/include/llama.h
-index 7cae1bbe..122e3cf1 100644
+index 7cae1bbe..aca09310 100644
 --- a/include/llama.h
 +++ b/include/llama.h
-@@ -423,6 +423,10 @@ extern "C" {
+@@ -240,6 +240,7 @@ extern "C" {
+ 
+         llama_token  *  token;
+         float        *  embd;
+        int32_t         n_embd;
+         llama_pos    *  pos;
+         int32_t      *  n_seq_id;
+         llama_seq_id ** seq_id;
+@@ -423,6 +424,10 @@ extern "C" {
                      struct llama_model * model,
             struct llama_context_params   params);
 
 +    // TODO (jmorganca): this should most likely be passed in as part of a batch
 +    // and not set on the context for all batches.
-+    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
 +
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 83b80b59..b189a19a 100644
+index 83b80b59..35748488 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
@ -160,13 +182,23 @@ index 83b80b59..b189a19a 100644
         GGML_ABORT("fatal error");
     }
 +
-+    bool cross_attention_layer(uint32_t il) const {
+    bool cross_attention_layers(uint32_t il) const {
 +        return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
 +    }
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2806,6 +2859,16 @@ struct llama_layer {
+@@ -2652,6 +2705,9 @@ struct llama_cparams {
+     bool offload_kqv;
+     bool flash_attn;
+     bool no_perf;
+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    bool cross_attn = false;
+ 
+     enum llama_pooling_type pooling_type;
+ 
+@@ -2806,6 +2862,16 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_scale;
 
     struct ggml_tensor * bskcn_tv;
@ -183,25 +215,21 @@ index 83b80b59..b189a19a 100644
 };
 
 // very similar to llama_batch,
-@@ -3452,6 +3515,12 @@ struct llama_context {
+@@ -3452,6 +3518,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
 +
-+    // TODO (jmorganca): this should most likely be passed in as part of a batch
-+    // and not set on the context for all batches.
-+    float * cross_attn_state = nullptr;
-+    bool cross_attn_state_first_pass = true;
 +    struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
 };
 
 struct llama_lora_weight {
-@@ -3686,6 +3755,18 @@ static bool llama_kv_cache_init(
+@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
     cache.v_l.reserve(n_layer);
 
     for (int i = 0; i < (int) n_layer; i++) {
 +        // for cross attention layers
-+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
 +            struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
 +            ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
 +            ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
@ -215,7 +243,7 @@ index 83b80b59..b189a19a 100644
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-@@ -5460,12 +5541,14 @@ static void llm_load_hparams(
+@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
     }
 
     // zero-out the per-layer hparams
@ -235,7 +263,7 @@ index 83b80b59..b189a19a 100644
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -5514,7 +5597,7 @@ static void llm_load_hparams(
+@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
 
         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
@ -244,7 +272,7 @@ index 83b80b59..b189a19a 100644
             if (hparams.n_rot != hparams.n_embd_head_k) {
                 throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
             }
-@@ -5554,6 +5637,16 @@ static void llm_load_hparams(
+@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
                     }
                 }
             } break;
@ -261,7 +289,7 @@ index 83b80b59..b189a19a 100644
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -7249,6 +7342,55 @@ static bool llm_load_tensors(
+@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
                         layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
@ -286,7 +314,7 @@ index 83b80b59..b189a19a 100644
 +
 +                        auto & layer = model.layers[i];
 +
-+                        if (hparams.cross_attention_layer(i)) {
+                        if (hparams.cross_attention_layers(i)) {
 +                            layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128});
 +                            layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024});
 +                            layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd});
@ -317,7 +345,7 @@ index 83b80b59..b189a19a 100644
             case LLM_ARCH_GROK:
                 {
                     if (n_expert == 0) {
-@@ -9093,7 +9235,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
         if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
             model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@ -326,16 +354,7 @@ index 83b80b59..b189a19a 100644
         }
 
         if (params.vocab_only) {
-@@ -9178,7 +9320,7 @@ static struct ggml_tensor * llm_build_inp_embd(
- 
-         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
-     } else {
-       lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
-+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
-         inpL = lctx.inp_embd;
-         ggml_set_input(lctx.inp_embd);
-     }
-@@ -9193,6 +9335,22 @@ static struct ggml_tensor * llm_build_inp_embd(
+@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }
 
@ -346,11 +365,10 @@ index 83b80b59..b189a19a 100644
 +         const llm_build_cb & cb) {
 +    const int64_t n_embd = hparams.n_embd;
 +
-+    struct ggml_tensor * inpCAS;
-+    lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
-+    cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
-+    ggml_set_input(lctx.inp_cross_attn_state);
-+    inpCAS = lctx.inp_cross_attn_state;
+    struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
+    cb(inpCAS, "inp_cross_attn_state", -1);
+    ggml_set_input(inpCAS);
+    lctx.inp_cross_attn_state = inpCAS;
 +
 +    return inpCAS;
 +}
@ -358,7 +376,7 @@ index 83b80b59..b189a19a 100644
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
-@@ -10167,6 +10325,7 @@ struct llm_build_context {
+@@ -10167,6 +10323,7 @@ struct llm_build_context {
         lctx.inp_pos_bucket    = nullptr;
         lctx.inp_embd_enc      = nullptr;
         lctx.inp_KQ_mask_cross = nullptr;
@ -366,7 +384,7 @@ index 83b80b59..b189a19a 100644
     }
 
     void free() {
-@@ -10754,6 +10913,253 @@ struct llm_build_context {
+@@ -10754,6 +10911,239 @@ struct llm_build_context {
                 LLM_NORM_RMS, cb, -1);
         cb(cur, "result_norm", -1);
 
@ -410,8 +428,8 @@ index 83b80b59..b189a19a 100644
 +                    LLM_NORM_RMS, cb, il);
 +            cb(cur, "attn_norm", il);
 +
-+            if (hparams.cross_attention_layer(il)) {
-+                if (!lctx.cross_attn_state) {
+            if (hparams.cross_attention_layers(il)) {
+                if (!batch.embd && !cparams.cross_attn) {
 +                    continue;
 +                }
 +
@ -422,42 +440,28 @@ index 83b80b59..b189a19a 100644
 +                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 +                cb(Qcur, "Qcur", il);
 +
-+                Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
-+                cb(Qcur, "Qcur", il);
-+
-+                // TODO: is this required?
-+                Qcur = ggml_cont(ctx0, Qcur);
+                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
 +                cb(Qcur, "Qcur", il);
 +
 +                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
 +                cb(Qcur, "Qcur", il);
 +
-+                struct ggml_tensor * Kcur;
-+                if (lctx.cross_attn_state_first_pass) {
+                struct ggml_tensor * Kcur, * Vcur;
+                if (batch.embd) {
 +                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
 +                    cb(Kcur, "Kcur", il);
 +
 +                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
 +                    cb(Kcur, "Kcur", il);
 +
-+                    Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
-+                    cb(Kcur, "Kcur", il);
-+
-+                    // TODO: is this required?
-+                    Kcur = ggml_cont(ctx0, Kcur);
+                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 +                    cb(Kcur, "Kcur", il);
 +
 +                    Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
 +                    cb(Kcur, "Kcur", il);
 +
 +                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
-+                } else {
-+                    Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
-+                    cb(Kcur, "Kcur (view)", il);
-+                }
 +
-+                struct ggml_tensor * Vcur;
-+                if (lctx.cross_attn_state_first_pass) {
 +                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
 +                    cb(Vcur, "Vcur", il);
 +
@ -469,6 +473,9 @@ index 83b80b59..b189a19a 100644
 +
 +                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
 +                } else {
+                    Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
+                    cb(Kcur, "Kcur (view)", il);
+
 +                    Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
 +                    cb(Vcur, "Vcur (view)", il);
 +                }
@ -476,11 +483,8 @@ index 83b80b59..b189a19a 100644
 +                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
 +                cb(kq, "kq", il);
 +
-+                kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
-+                cb(kq, "kq_scaled", il);
-+
 +                // TODO: apply causal masks
-+                struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
+                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
 +                cb(kq_soft_max, "kq_soft_max", il);
 +
 +                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
@ -620,7 +624,7 @@ index 83b80b59..b189a19a 100644
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
-@@ -16501,6 +16907,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_llama();
             } break;
@ -631,33 +635,48 @@ index 83b80b59..b189a19a 100644
         case LLM_ARCH_BAICHUAN:
             {
                 result = llm.build_baichuan();
-@@ -16773,6 +17183,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
-         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
     }
 
-+    // TODO (jmorganca): this might copy a lot of data on every request of a
-+    // single generation even though it doesn't change, so we should
-+    // find a way to not set this more than one time per image
-+    if (lctx.inp_cross_attn_state &&
-+        lctx.inp_cross_attn_state->buffer) {
-+        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
+     if (batch.embd) {
+-        const int64_t n_embd   = hparams.n_embd;
+-        const int64_t n_tokens = batch.n_tokens;
+        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+            ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+            // zero out inp_embd since it's not used
+            float * inp_embd_data = (float *)lctx.inp_embd->data;
+            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
+                inp_embd_data[i] = 0.0f;
 +            }
-+
-     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-         GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-         const int64_t n_tokens = batch.n_tokens;
-@@ -17455,6 +17873,10 @@ static int llama_decode_internal(
+        } else {
+            const int64_t n_embd   = hparams.n_embd;
+            const int64_t n_tokens = batch.n_tokens;
 
-         llama_set_inputs(lctx, ubatch);
+-        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+        }
+     }
 
-+        // TODO: replace with something better to find out if its
-+        // our first actual pass
-+        lctx.cross_attn_state_first_pass = false;
-+
-         llama_graph_compute(lctx, gf, n_threads, threadpool);
+     if (batch.pos && lctx.inp_pos) {
+@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
+         n_outputs = 1;
+     }
 
-         // update the kv ring buffer
-@@ -18648,7 +19070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
+         /* simple_split */ !kv_self.recurrent,
+         /* logits_all   */ n_outputs == n_tokens_all);
+ 
+@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
+ 
+     const int64_t n_embd = hparams.n_embd;
+ 
+-    lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+ 
+     const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
+ 
+@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
@ -668,19 +687,7 @@ index 83b80b59..b189a19a 100644
     }
 
     size_t total_size_org = 0;
-@@ -19744,6 +20168,11 @@ struct llama_context * llama_new_context_with_model(
-     return ctx;
- }
- 
-+void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
-+    ctx->cross_attn_state_first_pass = true;
-+    ctx->cross_attn_state = cross_attn_state;
-+}
-+
- void llama_free(struct llama_context * ctx) {
-     delete ctx;
- }
-@@ -19814,6 +20243,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
@ -688,3 +695,38 @@ index 83b80b59..b189a19a 100644
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
+@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+     ctx->cparams.causal_attn = causal_attn;
+ }
+ 
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+    ctx->cparams.cross_attn = cross_attention;
+}
+
+ struct llama_batch llama_batch_get_one(
+              llama_token * tokens,
+                  int32_t   n_tokens,
+@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
+         /*n_tokens       =*/ n_tokens,
+         /*tokens         =*/ tokens,
+         /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
+         /*pos            =*/ nullptr,
+         /*n_seq_id       =*/ nullptr,
+         /*seq_id         =*/ nullptr,
+@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+         /*n_tokens       =*/ 0,
+         /*tokens         =*/ nullptr,
+         /*embd           =*/ nullptr,
+        /*n_embd         =*/ 0,
+         /*pos            =*/ nullptr,
+         /*n_seq_id       =*/ nullptr,
+         /*seq_id         =*/ nullptr,
+@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+ 
+     if (embd) {
+         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
+        batch.n_embd = embd;
+     } else {
+         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
+     }
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@ -2,7 +2,6 @@ package main

 import (
 	"errors"
-	"hash/maphash"
 	"log/slog"
 	"reflect"
 	"time"
@ -20,10 +19,6 @@ type InputCache struct {
 	// optimize cache eviction for multiple users
 	multiUserCache bool

-	// cache of images to embeddings
-	images    []imageCache
-	imageHash maphash.Hash
-
 	lc *llama.Context
 }

@ -41,7 +36,6 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 		numCtx:         kvSize / numSlots,
 		slots:          slots,
 		multiUserCache: multiUserCache,
-		images:         make([]imageCache, numSlots),
 		lc:             lc,
 	}
 }
@ -211,55 +205,3 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscar
 	}
 	slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
 }
-
-// Locking: Lookup and store operations on imageCache require a lock
-// to be held that serializes these with each other. Hash does not
-// require a lock nor they need to be serialized with InputCacheSlot.
-
-type imageCache struct {
-	key      uint64
-	val      [][]float32
-	lastUsed time.Time
-}
-
-func (c *InputCache) HashImage(image []byte) uint64 {
-	c.imageHash.Reset()
-	_, _ = c.imageHash.Write(image)
-	return c.imageHash.Sum64()
-}
-
-var ErrImageNotFound = errors.New("image not found in cache")
-
-func (c *InputCache) FindImage(hash uint64) ([][]float32, error) {
-	for i := range c.images {
-		if c.images[i].key == hash {
-			slog.Debug("loading image embeddings from cache", "entry", i)
-			c.images[i].lastUsed = time.Now()
-			return c.images[i].val, nil
-		}
-	}
-
-	return nil, ErrImageNotFound
-}
-
-func (c *InputCache) AddImage(hash uint64, embed [][]float32) {
-	best := time.Now()
-	var bestImage int
-
-	for i := range c.images {
-		if c.images[i].key == hash {
-			bestImage = i
-			break
-		}
-
-		if c.images[i].lastUsed.Compare(best) < 0 {
-			best = c.images[i].lastUsed
-			bestImage = i
-		}
-	}
-
-	slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
-	c.images[bestImage].key = hash
-	c.images[bestImage].val = embed
-	c.images[bestImage].lastUsed = time.Now()
-}
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@ -1,7 +1,6 @@
 package main

 import (
-	"reflect"
 	"testing"
 	"time"
 )
@ -228,77 +227,3 @@ func TestFindCacheSlot(t *testing.T) {
 		})
 	}
 }
-
-func TestImageCache(t *testing.T) {
-	cache := NewInputCache(nil, 2048, 4, false)
-
-	valA := [][]float32{{0.1, 0.2}, {0.3}}
-	valB := [][]float32{{0.4}, {0.5}, {0.6}}
-	valC := [][]float32{{0.7}}
-	valD := [][]float32{{0.8}}
-	valE := [][]float32{{0.9}}
-
-	// Empty cache
-	result, err := cache.FindImage(0x5adb61d31933a946)
-	if err != ErrImageNotFound {
-		t.Errorf("found result in empty cache: result %v, err %v", result, err)
-	}
-
-	// Insert A
-	cache.AddImage(0x5adb61d31933a946, valA)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Insert B
-	cache.AddImage(0x011551369a34a901, valB)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valB) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Replace B with C
-	cache.AddImage(0x011551369a34a901, valC)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if !reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valC) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-
-	// Evict A
-	cache.AddImage(0x756b218a517e7353, valB)
-	cache.AddImage(0x75e5e8d35d7e3967, valD)
-	cache.AddImage(0xd96f7f268ca0646e, valE)
-
-	result, err = cache.FindImage(0x5adb61d31933a946)
-	if reflect.DeepEqual(result, valA) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x756b218a517e7353)
-	if !reflect.DeepEqual(result, valB) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x011551369a34a901)
-	if !reflect.DeepEqual(result, valC) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0x75e5e8d35d7e3967)
-	if !reflect.DeepEqual(result, valD) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-	result, err = cache.FindImage(0xd96f7f268ca0646e)
-	if !reflect.DeepEqual(result, valE) {
-		t.Errorf("failed to find expected value: result %v, err %v", result, err)
-	}
-}
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@ -0,0 +1,183 @@
+package main
+
+import (
+	"errors"
+	"fmt"
+	"hash/maphash"
+	"log/slog"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/ollama/ollama/llama"
+)
+
+const imageCacheSize = 4
+
+// ImageContext bundles the vision model handle used to turn images into
+// embeddings with a small cache of previously computed embeddings.
+type ImageContext struct {
+	// mu is required to be held when generating embeddings or accessing the cache
+	mu sync.Mutex
+
+	// vision model handles; NewImageContext populates clip or mllama depending
+	// on the architecture reported for the model file
+	clip   *llama.ClipContext
+	mllama *llama.MllamaContext
+
+	// cache of images to embeddings; imageHash is the hasher hashImage reuses
+	// to derive the cache keys
+	images    []imageCache
+	imageHash maphash.Hash
+}
+
+// NewImageContext loads the vision model stored at modelPath and returns an
+// ImageContext whose embedding cache has imageCacheSize empty slots.
+//
+// The architecture is read from the model file: "clip" and "mllama" are
+// supported, anything else is reported as an error. Errors from reading the
+// architecture or from creating the model context are returned to the caller.
+func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
+	arch, err := llama.GetModelArch(modelPath)
+	if err != nil {
+		return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
+	}
+
+	ic := &ImageContext{}
+	switch arch {
+	case "clip":
+		ic.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	case "mllama":
+		ic.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
+	default:
+		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	ic.images = make([]imageCache, imageCacheSize)
+	return ic, nil
+}
+
+// Free releases whichever vision model context (clip and/or mllama) is set.
+// It is a no-op on a nil receiver. The modelPath argument is not used by
+// this method.
+func (c *ImageContext) Free(modelPath string) {
+	if c == nil {
+		return
+	}
+
+	if c.clip != nil {
+		c.clip.Free()
+	}
+	if c.mllama != nil {
+		c.mllama.Free()
+	}
+}
+
+// NewEmbed returns the embeddings for the image in data. Embeddings are looked
+// up in the cache by the hash of data; on a miss they are generated with the
+// loaded vision model and stored in the cache before being returned.
+//
+// A nil receiver returns (nil, nil). Zero-length data is rejected with an
+// error, as is an image received when neither vision model is set.
+// aspectRatioId is only forwarded to the mllama model.
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+	if c == nil {
+		return nil, nil
+	}
+
+	if len(data) == 0 {
+		return nil, errors.New("received zero length image")
+	}
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Hash only while holding mu: hashImage resets and writes to the shared
+	// c.imageHash, so two goroutines hashing at once would corrupt each
+	// other's cache keys
+	hash := c.hashImage(data)
+
+	if cached, err := c.findImage(hash); err == nil {
+		return cached, nil
+	}
+
+	var (
+		embed [][]float32
+		err   error
+	)
+	switch {
+	case c.mllama != nil:
+		embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+	case c.clip != nil:
+		embed, err = c.clip.NewEmbed(llamaContext, data)
+	default:
+		return nil, errors.New("received image but vision model not loaded")
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	c.addImage(hash, embed)
+	return embed, nil
+}
+
+// BatchSize returns the size to allocate the embedding batch with, given the
+// configured batch size: 0 on a nil receiver, 1 when an mllama model is
+// loaded, and configuredBatchSize otherwise.
+func (c *ImageContext) BatchSize(configuredBatchSize int) int {
+	switch {
+	case c == nil:
+		// If images are not supported, we don't need to allocate embedding batches
+		return 0
+	case c.mllama != nil:
+		// Mllama maps an image to 1 embedding token (llava creates many tokens)
+		// and doesn't support more than a single image per request.
+		// The embeddings are large (100 MB), so allocating a big batch can fail
+		// on some systems
+		return 1
+	default:
+		return configuredBatchSize
+	}
+}
+
+// EmbedSize returns the size of one embedding: the mllama model's own embed
+// size when an mllama model is loaded, otherwise the NEmbd() of the model
+// behind llamaContext (this also covers a nil receiver).
+func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
+	if c == nil || c.mllama == nil {
+		return llamaContext.Model().NEmbd()
+	}
+
+	return c.mllama.EmbedSize(llamaContext)
+}
+
+// NeedCrossAttention reports whether any of the given inputs carries an
+// embedding while an mllama model is loaded. It is always false on a nil
+// receiver or when no mllama model is set.
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	hasEmbed := func(in input) bool {
+		return in.embed != nil
+	}
+
+	return c != nil && c.mllama != nil && slices.ContainsFunc(inputs, hasEmbed)
+}
+
+// imageCache is one cache slot pairing an image hash with its embeddings.
+type imageCache struct {
+	key      uint64      // hash of the image bytes, as produced by hashImage
+	val      [][]float32 // embeddings stored for that image
+	lastUsed time.Time   // set on every hit and store; addImage replaces the slot with the oldest value
+}
+
+// hashImage returns the cache key for image. It resets and then writes to the
+// shared c.imageHash, so calls must not overlap with one another.
+func (c *ImageContext) hashImage(image []byte) uint64 {
+	hasher := &c.imageHash
+	hasher.Reset()
+	_, _ = hasher.Write(image)
+	return hasher.Sum64()
+}
+
+// errImageNotFound is returned by findImage when no cache slot holds the hash.
+var errImageNotFound = errors.New("image not found in cache")
+
+// findImage returns the embeddings cached under hash and marks the slot as
+// just used. It returns errImageNotFound when no populated slot matches.
+func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
+	for i := range c.images {
+		entry := &c.images[i]
+
+		// Slots that were never stored to still have key 0 and a zero
+		// lastUsed; skip them so a hash of 0 is not reported as a hit
+		// with nil embeddings
+		if entry.lastUsed.IsZero() || entry.key != hash {
+			continue
+		}
+
+		slog.Debug("loading image embeddings from cache", "entry", i)
+		entry.lastUsed = time.Now()
+		return entry.val, nil
+	}
+
+	return nil, errImageNotFound
+}
+
+// addImage stores embed under hash. A slot already holding hash is reused;
+// otherwise the slot with the oldest lastUsed time is overwritten.
+func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
+	victim := 0
+	oldest := time.Now()
+
+	for idx := range c.images {
+		slot := &c.images[idx]
+
+		if slot.key == hash {
+			victim = idx
+			break
+		}
+
+		if slot.lastUsed.Before(oldest) {
+			oldest, victim = slot.lastUsed, idx
+		}
+	}
+
+	// log the previous lastUsed of the slot before it is overwritten
+	slog.Debug("storing image embeddings in cache", "entry", victim, "used", c.images[victim].lastUsed)
+	c.images[victim] = imageCache{
+		key:      hash,
+		val:      embed,
+		lastUsed: time.Now(),
+	}
+}
--- a/llama/runner/image_test.go
+++ b/llama/runner/image_test.go
@ -0,0 +1,80 @@
+package main
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestImageCache exercises lookup, insertion, in-place replacement and
+// least-recently-used eviction on a four-slot image cache.
+func TestImageCache(t *testing.T) {
+	cache := ImageContext{images: make([]imageCache, 4)}
+
+	const (
+		keyA = 0x5adb61d31933a946
+		keyB = 0x011551369a34a901
+		keyC = 0x756b218a517e7353
+		keyD = 0x75e5e8d35d7e3967
+		keyE = 0xd96f7f268ca0646e
+	)
+
+	valA := [][]float32{{0.1, 0.2}, {0.3}}
+	valB := [][]float32{{0.4}, {0.5}, {0.6}}
+	valC := [][]float32{{0.7}}
+	valD := [][]float32{{0.8}}
+	valE := [][]float32{{0.9}}
+
+	// expectHit fails the test unless key is cached with exactly want
+	expectHit := func(key uint64, want [][]float32) {
+		t.Helper()
+		result, err := cache.findImage(key)
+		if err != nil || !reflect.DeepEqual(result, want) {
+			t.Errorf("failed to find expected value: result %v, err %v", result, err)
+		}
+	}
+
+	// expectMiss fails the test unless key is reported as absent
+	expectMiss := func(key uint64) {
+		t.Helper()
+		result, err := cache.findImage(key)
+		if err != errImageNotFound {
+			t.Errorf("found unexpected result in cache: result %v, err %v", result, err)
+		}
+	}
+
+	// Empty cache
+	expectMiss(keyA)
+
+	// Insert A
+	cache.addImage(keyA, valA)
+	expectHit(keyA, valA)
+
+	// Insert B
+	cache.addImage(keyB, valB)
+	expectHit(keyA, valA)
+	expectHit(keyB, valB)
+
+	// Replace B with C
+	cache.addImage(keyB, valC)
+	expectHit(keyA, valA)
+	expectHit(keyB, valC)
+
+	// Evict A: it is the least recently used entry once the cache is full
+	cache.addImage(keyC, valB)
+	cache.addImage(keyD, valD)
+	cache.addImage(keyE, valE)
+
+	expectMiss(keyA)
+	expectHit(keyC, valB)
+	expectHit(keyB, valC)
+	expectHit(keyD, valD)
+	expectHit(keyE, valE)
+}
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@ -52,6 +52,10 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

+	// crossAttention records whether this sequence requires the cross-attention
+	// layers to be processed, i.e. an image has been seen for one of the
+	// multi-modal models that use them
+	crossAttention bool
+
 	// channel to send responses over
 	responses chan string

@ -127,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen

 	var sc *llama.SamplingContext
 	if params.samplingParams != nil {
-		sc = llama.NewSamplingContext(s.model, *params.samplingParams)
+		sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
+		if err != nil {
+			return nil, err
+		}
 		for _, input := range inputs {
 			if input.embed == nil {
 				sc.Accept(input.token, false)
@ -190,16 +197,10 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}

-			hash := s.cache.HashImage(images[imageIndex].Data)
-
-			// Vision models cannot be accessed concurrently
-			s.clip.mu.Lock()
-			embed, err := s.cache.FindImage(hash)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
 			if err != nil {
-				embed = llama.NewLlavaImageEmbed(s.lc, s.clip.cc, images[imageIndex].Data)
-				s.cache.AddImage(hash, embed)
+				return nil, err
 			}
-			s.clip.mu.Unlock()

 			for _, e := range embed {
 				inputs = append(inputs, input{embed: e})
@ -207,41 +208,17 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 		}
 	}

-	if s.clip.cc != nil {
-		var embed [][]float32
-
-		if s.clip.cc.IsMllama && len(images) >= 1 {
-			hash := s.cache.HashImage(images[0].Data)
-
-			s.clip.mu.Lock()
-			var err error
-			embed, err = s.cache.FindImage(hash)
-			if err != nil {
-				embed = llama.NewMllamaImageEmbed(s.lc, s.clip.cc, images[0].Data, images[0].AspectRatioID)
-				s.cache.AddImage(hash, embed)
-			}
-			s.clip.mu.Unlock()
-		}
-		s.mu.Lock()
-		llama.MllamaSetCrossAttn(s.lc, s.clip.cc, embed)
-		s.mu.Unlock()
-	}
-
 	return inputs, nil
 }

-type clip struct {
-	cc *llama.ClipContext
-	mu sync.Mutex
-}
-
 type Server struct {
 	model *llama.Model
 	lc    *llama.Context

 	// required for image embeddings
-	clip clip
+	image *ImageContext

+	// TODO (jmorganca): make this n_batch
 	batchSize int

 	// parallel is the number of parallel requests to handle
@ -327,22 +304,31 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
 	close(seq.responses)
 	close(seq.embedding)
 	seq.cache.InUse = false
-	if s.clip.cc != nil {
-		llama.MllamaSetCrossAttn(s.lc, s.clip.cc, nil)
-	}
 	s.seqs[seqIndex] = nil
 }

 func (s *Server) run(ctx context.Context) {
 	s.ready.Wait()

-	// logically these batches are used only within the context of processBatch
+	// Logically these batches are used only within the context of processBatch
 	// but it is better for performance to allocate them once here
-	tokenBatch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
+	tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
+	if err != nil {
+		panic(err)
+	}
 	defer tokenBatch.Free()

-	embedBatch := llama.NewBatch(s.batchSize*len(s.seqs), s.lc.Model().NEmbd(), len(s.seqs))
+	var embedBatch *llama.Batch
+	embedBatchSize := s.image.BatchSize(s.batchSize)
+	if embedBatchSize != 0 {
+		embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+		if err != nil {
+			panic(err)
+		}
 		defer embedBatch.Free()
+	} else {
+		embedBatch = &llama.Batch{}
+	}

 	for {
 		select {
@ -371,6 +357,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
+	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@ -404,18 +391,19 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 					batch = tokenBatch
 				} else {
 					batch = embedBatch
+					seq.crossAttention = s.image.NeedCrossAttention(input)
 				}
-			} else if embedding != batch.IsEmbedding() {
+			} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 				s.nextSeq = seqIdx
 				break
 			}

-			// todo: make this n_batch
-			if i >= s.batchSize {
+			if i >= batch.Size() {
 				break
 			}

-			batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
+			crossAttention = seq.crossAttention
+			batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
 			seq.numPast++
 			numInputsProcessed++
 		}
@ -431,6 +419,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return
 	}

+	s.lc.SetCrossAttention(crossAttention)
+
 	err := s.lc.Decode(batch)
 	if err != nil {
 		slog.Error("failed to decode batch", "error", err)
@ -648,6 +638,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
 				return
 			}
+
+			seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
+
 			s.seqs[i] = seq
 			s.cond.Signal()
 			break
@ -815,7 +808,7 @@ func (s *Server) loadModel(

 	if ppath != "" {
 		var err error
-		s.clip.cc, err = llama.NewClipContext(ppath)
+		s.image, err = NewImageContext(s.lc, ppath)
 		if err != nil {
 			panic(err)
 		}
@ -844,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")

-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@ -874,7 +861,7 @@ func main() {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting go runner")
-	slog.Debug("system info", "cpu", llama.PrintSystemInfo(), "threads", *threads)
+	slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)

 	server := &Server{
 		batchSize: *batchSize,
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@ -5,6 +5,7 @@
 struct gpt_sampler *gpt_sampler_cinit(
    const struct llama_model *model, struct gpt_sampler_cparams *params)
 {
+    try {
        gpt_sampler_params sparams;
        sparams.top_k = params->top_k;
        sparams.top_p = params->top_p;
@ -23,6 +24,9 @@ struct gpt_sampler *gpt_sampler_cinit(
        sparams.seed = params->seed;
        sparams.grammar = params->grammar;
        return gpt_sampler_init(model, sparams);
+    } catch (const std::exception & err) {
+        return nullptr;
+    }
 }

 void gpt_sampler_cfree(struct gpt_sampler *sampler)
--- a/llama/vendoring
+++ b/llama/vendoring
@ -0,0 +1 @@
+LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@ -1,15 +0,0 @@
-set(TARGET ollama_llama_server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp httplib.h)
-install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
-if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
-    target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
-endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/llm/ext_server/httplib.h
+++ b/llm/ext_server/httplib.h
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@ -1,661 +0,0 @@
-// MIT License
-
-// Copyright (c) 2023 Georgi Gerganov
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
-#include <random>
-#include <iostream>
-#include <thread>
-
-#include "json.hpp"
-
-#include "../llava/clip.h"
-
-using json = nlohmann::json;
-
-extern bool server_verbose;
-extern bool server_log_json;
-
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...)                                            \
-    do                                                                   \
-    {                                                                    \
-        if (server_verbose)                                              \
-        {                                                                \
-            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
-        }                                                                \
-    } while (0)
-#endif
-
-#define LOG_ERROR(  MSG, ...) server_log("ERROR",  __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
-};
-
-enum task_type {
-    TASK_TYPE_COMPLETION,
-    TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE,
-    TASK_TYPE_METRICS
-};
-
-struct task_server {
-    int id = -1; // to be filled by llama_server_queue
-    int target_id;
-    task_type type;
-    json data;
-    bool infill_mode = false;
-    bool embedding_mode = false;
-    int multitask_id = -1;
-};
-
-struct task_result {
-    int id;
-    int multitask_id = -1;
-    bool stop;
-    bool error;
-    json result_json;
-};
-
-struct task_multi {
-    int id;
-    std::set<int> subtasks_remaining{};
-    std::vector<task_result> results{};
-};
-
-// completion token output with probabilities
-struct completion_token_output {
-    struct token_prob
-    {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-    llama_token tok;
-    std::string text_to_send;
-};
-
-struct token_translator {
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
-    std::stringstream ss_tid;
-    ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
-        {"tid", ss_tid.str()},
-        {"timestamp", time(nullptr)},
-    };
-
-    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
-        return;
-    }
-
-    if (server_log_json) {
-        log.merge_patch(
-                {
-                        {"level",     level},
-                        {"function",  function},
-                        {"line",      line},
-                        {"msg",       message},
-                });
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-
-        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
-    } else {
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-
-        std::stringstream ss;
-        ss << level << " [" << function << "] " << message << " |";
-        for (const auto& el : log.items())
-        {
-            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
-            ss << " " << el.key() << "=" << value;
-        }
-
-        const std::string str = ss.str();
-        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
-    }
-}
-
-//
-// server utils
-//
-
-// Look up `key` in `body`; a missing key and an explicit JSON null both yield `default_value`.
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
-    if (!body.contains(key) || body.at(key).is_null()) {
-        return default_value;
-    }
-    return body.value(key, default_value);
-}
-
-// Returns true when llama_chat_apply_template accepts the template supplied via "--chat-template".
-inline bool verify_custom_template(const std::string & tmpl) {
-    // Probe with a single user message and a one-byte scratch buffer; only the sign of the
-    // result is inspected.
-    llama_chat_message probe[] = {{"user", "test"}};
-    std::vector<char> scratch(1);
-    const int status = llama_chat_apply_template(nullptr, tmpl.c_str(), probe, 1, true, scratch.data(), scratch.size());
-    return status >= 0;
-}
-
-// Render `messages` into one prompt string with llama_chat_apply_template.
-// An empty `tmpl` passes nullptr as the template, i.e. the template comes from model metadata.
-// NOTE(review): a negative result from llama_chat_apply_template is not handled separately;
-// the size_t cast below sends it down the resize branch.
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    const size_t n_messages = messages.size();
-
-    // `storage` owns the role/content text; `chat` only borrows the c_str() pointers,
-    // so `storage` must outlive every use of `chat`.
-    std::vector<std::string> storage(n_messages * 2);
-    std::vector<llama_chat_message> chat(n_messages);
-    size_t content_bytes = 0;
-
-    for (size_t idx = 0; idx < n_messages; ++idx) {
-        const auto & message  = messages[idx];
-        std::string & role    = storage[idx*2 + 0];
-        std::string & content = storage[idx*2 + 1];
-
-        role    = json_value(message, "role",    std::string(""));
-        content = json_value(message, "content", std::string(""));
-
-        content_bytes    += content.length();
-        chat[idx].role    = role.c_str();
-        chat[idx].content = content.c_str();
-    }
-
-    const char * tmpl_ptr = nullptr;
-    if (!tmpl.empty()) {
-        tmpl_ptr = tmpl.c_str();
-    }
-
-    // first attempt: guess twice the total content length
-    std::vector<char> out(content_bytes * 2);
-    int32_t written = llama_chat_apply_template(model, tmpl_ptr, chat.data(), chat.size(), true, out.data(), out.size());
-
-    // the call reports the length it needs; grow and run again when the guess was too small
-    if ((size_t) written > out.size()) {
-        out.resize(written);
-        written = llama_chat_apply_template(model, tmpl_ptr, chat.data(), chat.size(), true, out.data(), out.size());
-    }
-
-    std::string formatted_chat(out.data(), written);
-    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
-    return formatted_chat;
-}
-
-//
-// work queue utils
-//
-
-// Task queue consumed by start_loop(): producers post() or defer() tasks, the loop hands each
-// task to the registered callbacks, finishes completed multitasks and then runs the slots.
-struct llama_server_queue {
-    int id = 0;
-    std::mutex mutex_tasks;
-    // Initialised so the flag never holds an indeterminate value before start_loop() sets it.
-    bool running = false;
-    // queues
-    std::vector<task_server> queue_tasks;
-    std::vector<task_server> queue_tasks_deferred;
-    std::vector<task_multi> queue_multitasks;
-    std::condition_variable condition_tasks;
-    // callback functions
-    std::function<void(task_server&)> callback_new_task;
-    std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_run_slots;
-
-    // Add a new task to the end of the queue and wake the loop.
-    // A task whose id is -1 is assigned the next free id. Returns the id of the queued task.
-    int post(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        if (task.id == -1) {
-            task.id = id++;
-            LOG_VERBOSE("new task id", {{"new_id", task.id}});
-        }
-        // Read the id before the task is moved into the queue; a moved-from object should
-        // not be inspected afterwards.
-        const int task_id = task.id;
-        queue_tasks.push_back(std::move(task));
-        condition_tasks.notify_one();
-        return task_id;
-    }
-
-    // Add a new task, but defer until one slot is available
-    void defer(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        queue_tasks_deferred.push_back(std::move(task));
-    }
-
-    // Get the next id for creating a new task
-    int get_new_id() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        int new_id = id++;
-        LOG_VERBOSE("new task id", {{"new_id", new_id}});
-        return new_id;
-    }
-
-    // Register function to process a new task
-    void on_new_task(std::function<void(task_server&)> callback) {
-        callback_new_task = callback;
-    }
-
-    // Register function to process a multitask when it is finished
-    void on_finish_multitask(std::function<void(task_multi&)> callback) {
-        callback_finish_multitask = callback;
-    }
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_run_slots(std::function<void(void)> callback) {
-        callback_run_slots = callback;
-    }
-
-    // Call when the state of one slot is changed
-    void notify_slot_changed() {
-        // move deferred tasks back to main loop
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        for (auto & task : queue_tasks_deferred) {
-            queue_tasks.push_back(std::move(task));
-        }
-        queue_tasks_deferred.clear();
-    }
-
-    // end the start_loop routine
-    void terminate() {
-        {
-            std::unique_lock<std::mutex> lock(mutex_tasks);
-            running = false;
-        }
-        condition_tasks.notify_all();
-    }
-
-    /**
-     * Main loop consists of these steps:
-     * - Wait until a new task arrives
-     * - Process the task (i.e. maybe copy data into slot)
-     * - Check if multitask is finished
-     * - Run all slots
-     */
-    void start_loop() {
-        running = true;
-        while (true) {
-            LOG_VERBOSE("new task may arrive", {});
-            {
-                while (true)
-                {
-                    std::unique_lock<std::mutex> lock(mutex_tasks);
-                    if (queue_tasks.empty()) {
-                        lock.unlock();
-                        break;
-                    }
-                    // The front element is erased right away, so move it out instead of
-                    // copying the whole task.
-                    task_server task = std::move(queue_tasks.front());
-                    queue_tasks.erase(queue_tasks.begin());
-                    // run the callback without holding the lock
-                    lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
-                    callback_new_task(task);
-                }
-                LOG_VERBOSE("update_multitasks", {});
-                // check if we have any finished multitasks
-                // NOTE(review): this walk does not take mutex_tasks, while add_multitask and
-                // update_multitask do - confirm those are never called concurrently with it.
-                auto queue_iterator = queue_multitasks.begin();
-                while (queue_iterator != queue_multitasks.end())
-                {
-                    if (queue_iterator->subtasks_remaining.empty())
-                    {
-                        // all subtasks done == multitask is done
-                        task_multi current_multitask = *queue_iterator;
-                        callback_finish_multitask(current_multitask);
-                        // remove this multitask
-                        queue_iterator = queue_multitasks.erase(queue_iterator);
-                    }
-                    else
-                    {
-                        ++queue_iterator;
-                    }
-                }
-                // all tasks in the current loop are processed, slots data is now ready
-                LOG_VERBOSE("callback_run_slots", {});
-                callback_run_slots();
-            }
-            LOG_VERBOSE("wait for new task", {});
-            // wait for new task
-            {
-                std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (queue_tasks.empty()) {
-                    if (!running) {
-                        LOG_VERBOSE("ending start_loop", {});
-                        return;
-                    }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
-                }
-            }
-        }
-    }
-
-    //
-    // functions to manage multitasks
-    //
-
-    // add a multitask by specifying the id of all subtask (subtask is a task_server)
-    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_multi multi;
-        multi.id = multitask_id;
-        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
-        queue_multitasks.push_back(multi);
-    }
-
-    // update the remaining subtasks, while appending results to multitask
-    void update_multitask(int multitask_id, int subtask_id, task_result& result)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        for (auto& multitask : queue_multitasks)
-        {
-            if (multitask.id == multitask_id)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
-                multitask.results.push_back(result);
-            }
-        }
-    }
-};
-
-// Result mailbox: send() queues a result for a task that is registered as waiting, recv()
-// blocks until the result for a given task id is available.
-struct llama_server_response {
-    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
-    callback_multitask_t callback_update_multitask;
-    // for keeping track of all tasks waiting for the result
-    std::set<int> waiting_task_ids;
-    // the main result queue
-    std::vector<task_result> queue_results;
-    std::mutex mutex_results;
-    std::condition_variable condition_results;
-
-    // add the task_id to the list of tasks waiting for response
-    void add_waiting_task_id(int task_id) {
-        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.insert(task_id);
-    }
-
-    // when the request is finished, we can remove task associated with it
-    void remove_waiting_task_id(int task_id) {
-        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.erase(task_id);
-    }
-
-    // This function blocks the thread until there is a response for this task_id,
-    // then removes that response from the queue and returns it.
-    task_result recv(int task_id) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-
-        // Wait for *this* task's result rather than for "queue not empty": with the latter a
-        // waiter whose result has not arrived keeps waking up and rescanning for as long as
-        // some other task's result sits in the queue.
-        std::vector<task_result>::iterator match;
-        condition_results.wait(lock, [&]{
-            for (match = queue_results.begin(); match != queue_results.end(); ++match)
-            {
-                if (match->id == task_id)
-                {
-                    return true;
-                }
-            }
-            return false;
-        });
-
-        // the lock is held here, so `match` still points at the element found by the predicate
-        assert(match->multitask_id == -1);
-        task_result res = *match;
-        queue_results.erase(match);
-        return res;
-    }
-
-    // Register the function to update multitask
-    void on_multitask_update(callback_multitask_t callback) {
-        callback_update_multitask = callback;
-    }
-
-    // Send a new result to a waiting task_id.
-    // A result whose id is not in waiting_task_ids is not queued.
-    void send(task_result result) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {{"task_id", result.id}});
-        for (auto& task_id : waiting_task_ids) {
-            // LOG_TEE("waiting task id %i \n", task_id);
-            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
-            if (result.multitask_id == task_id)
-            {
-                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
-                callback_update_multitask(task_id, result.id, result);
-                continue;
-            }
-
-            if (result.id == task_id)
-            {
-                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
-                queue_results.push_back(result);
-                // every waiter is woken; each one re-checks for its own id in recv()
-                condition_results.notify_all();
-                return;
-            }
-        }
-    }
-};
-
-//
-// base64 utils (TODO: move to common in the future)
-//
-
-// Base64 alphabet; a character's position in this string is its 6-bit value.
-static const std::string base64_chars("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
-
-// True for the characters accepted as base64 input here: alphanumerics, '+' and '/'.
-static inline bool is_base64(uint8_t c)
-{
-    if (c == '+' || c == '/')
-    {
-        return true;
-    }
-    return isalnum(c) != 0;
-}
-
-// Decode a base64 string into bytes. Decoding stops at the first '=' or at the first
-// character that is_base64() rejects; whatever was decoded up to that point is returned.
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
-{
-    std::vector<uint8_t> decoded;
-
-    uint8_t quad[4];
-    uint8_t triple[3];
-    int filled = 0;
-
-    const int total = encoded_string.size();
-
-    for (int pos = 0; pos < total; pos++)
-    {
-        const char ch = encoded_string[pos];
-        if (ch == '=' || !is_base64(ch))
-        {
-            break;
-        }
-
-        quad[filled++] = ch;
-        if (filled < 4)
-        {
-            continue;
-        }
-
-        // a full group of four characters: map each to its 6-bit value, emit three bytes
-        for (int k = 0; k < 4; k++)
-        {
-            quad[k] = base64_chars.find(quad[k]);
-        }
-
-        triple[0] = ((quad[0]      ) << 2) + ((quad[1] & 0x30) >> 4);
-        triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
-        triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];
-
-        for (int k = 0; k < 3; k++)
-        {
-            decoded.push_back(triple[k]);
-        }
-        filled = 0;
-    }
-
-    if (filled > 0)
-    {
-        // trailing partial group: zero the unused positions, then emit filled - 1 bytes
-        for (int k = filled; k < 4; k++)
-        {
-            quad[k] = 0;
-        }
-
-        for (int k = 0; k < 4; k++)
-        {
-            quad[k] = base64_chars.find(quad[k]);
-        }
-
-        triple[0] = ((quad[0]      ) << 2) + ((quad[1] & 0x30) >> 4);
-        triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
-        triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];
-
-        for (int k = 0; k < filled - 1; k++)
-        {
-            decoded.push_back(triple[k]);
-        }
-    }
-
-    return decoded;
-}
-
-//
-// random string / id
-//
-
-// Build a 32-character string drawn from [0-9A-Za-z], using a std::mt19937 seeded from
-// std::random_device on every call.
-static std::string random_string()
-{
-    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
-
-    std::random_device seed_source;
-    std::mt19937 engine(seed_source());
-
-    std::string out;
-    out.reserve(32);
-    while (out.size() < 32) {
-        out.push_back(alphabet[engine() % alphabet.size()]);
-    }
-
-    return out;
-}
-
-// Build a chat completion id: "chatcmpl-" followed by a random_string().
-static std::string gen_chatcmplid()
-{
-    return "chatcmpl-" + random_string();
-}
-
-//
-// other common utils
-//
-
-// Length of the longest common prefix of the two token sequences.
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    const size_t limit = a.size() < b.size() ? a.size() : b.size();
-    size_t matched = 0;
-    while (matched < limit && a[matched] == b[matched])
-    {
-        matched++;
-    }
-    return matched;
-}
-
-// True when `str` ends with `suffix` (an empty suffix always matches).
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    if (suffix.size() > str.size())
-    {
-        return false;
-    }
-    const size_t offset = str.size() - suffix.size();
-    return str.compare(offset, suffix.size(), suffix) == 0;
-}
-
-// If `text` ends with a non-empty prefix of `stop`, return the offset in `text` where that
-// prefix begins; otherwise return std::string::npos. Longer prefixes are tried first.
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (text.empty() || stop.empty())
-    {
-        return std::string::npos;
-    }
-
-    const char last = text.back();
-    for (int64_t end = stop.size() - 1; end >= 0; end--)
-    {
-        // only prefixes ending in the last character of `text` can match its tail
-        if (stop[end] != last)
-        {
-            continue;
-        }
-        const std::string prefix = stop.substr(0, end + 1);
-        if (ends_with(text, prefix))
-        {
-            return text.size() - end - 1;
-        }
-    }
-    return std::string::npos;
-}
-
-// Concatenate the text pieces of every token in [begin, end).
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
-    std::string joined;
-    while (begin != end)
-    {
-        joined += llama_token_to_piece(ctx, *begin);
-        ++begin;
-    }
-    return joined;
-}
-
-// Text of `token` for output; token -1 yields an empty string. A piece that is a single byte
-// with the high bit set (a partial multibyte character) is rendered as "byte: \x<hex>".
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string piece;
-    if (token != -1)
-    {
-        piece = llama_token_to_piece(ctx, token);
-    }
-
-    // size > 1 means the piece is already a known token and is passed through unchanged
-    const bool lone_high_byte = piece.size() == 1 && (piece[0] & 0x80) == 0x80;
-    if (!lone_high_byte)
-    {
-        return piece;
-    }
-
-    std::stringstream hex_stream;
-    hex_stream << std::hex << (piece[0] & 0xff);
-    return "byte: \\x" + hex_stream.str();
-}
-
-// Convert completion_token_output entries to a JSON array. Each element has "content" (the
-// token text) and "probs", an array of {"tok_str", "prob"} objects for its candidates.
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
-    json result = json::array();
-    for (const auto &entry : probs)
-    {
-        json candidates = json::array();
-        for (const auto &candidate : entry.probs)
-        {
-            const std::string candidate_text = tokens_to_output_formatted_string(ctx, candidate.tok);
-            candidates.push_back(json
-            {
-                {"tok_str", candidate_text},
-                {"prob",    candidate.prob},
-            });
-        }
-
-        const std::string content = tokens_to_output_formatted_string(ctx, entry.tok);
-        result.push_back(json{
-            {"content", content},
-            {"probs",   candidates},
-        });
-    }
-    return result;
-}
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@ -1,137 +0,0 @@
-# common logic across linux and darwin
-
-init_vars() { # (re)set the per-build variables; the generate scripts call this before each runner build
-    case "${GOARCH}" in # translate the Go architecture name into the value stored in ARCH
-    "amd64")
-        ARCH="x86_64"
-        ;;
-    "arm64")
-        ARCH="arm64"
-        ;;
-    *)
-        echo "GOARCH must be set"
-        echo "this script is meant to be run from within go generate"
-        exit 1
-        ;;
-    esac
-
-    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
-    CMAKE_TARGETS="--target ollama_llama_server"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then # any "-g" substring in CGO_CFLAGS selects the debug-info build
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
-    else
-        # TODO - add additional optimization flags...
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
-    fi
-    case $(uname -s) in # per-OS library extension, linker flags and output locations
-    "Darwin")
-        LIB_EXT="dylib"
-        WHOLE_ARCHIVE="-Wl,-force_load"
-        NO_WHOLE_ARCHIVE=""
-        GCC_ARCH="-arch ${ARCH}"
-        DIST_BASE=../../dist/darwin-${GOARCH}/
-        PAYLOAD_BASE=../../build/darwin/${GOARCH}
-        ;;
-    "Linux")
-        LIB_EXT="so"
-        WHOLE_ARCHIVE="-Wl,--whole-archive"
-        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
-
-        # Cross compiling not supported on linux - Use docker
-        GCC_ARCH=""
-        DIST_BASE=../../dist/linux-${GOARCH}/
-        PAYLOAD_BASE=../../build/linux/${GOARCH}
-        ;;
-    *)
-        ;; # any other OS: the variables above are left unset
-    esac
-    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then # default list, used only when the caller did not set one
-        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
-    fi
-    GZIP=$(command -v pigz 2>/dev/null || echo "gzip") # use pigz when it is on PATH, otherwise plain gzip
-    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
-}
-
-# Initialise the llama.cpp submodule and force it to the recorded revision.
-# Does nothing when OLLAMA_SKIP_PATCHING is set.
-git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
-        echo "Skipping submodule initialization"
-        return
-    fi
-    # Make sure the tree is clean after the directory moves
-    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
-        echo "Cleaning up old submodule"
-        # Quoted so the path is never word-split or globbed; :? aborts instead of
-        # running rm -rf with an empty argument if the variable is unset.
-        rm -rf "${LLAMACPP_DIR:?}"
-    fi
-    git submodule init
-    git submodule update --force "${LLAMACPP_DIR}"
-
-}
-
-# Apply every ../patches/*.patch to the llama.cpp checkout with git am.
-apply_patches() {
-    # apply temporary patches until fix is upstream
-    for patch in ../patches/*.patch; do
-        # When nothing matches, the glob stays as the literal pattern; skip it rather than
-        # handing a non-existent file to git am.
-        [ -e "${patch}" ] || continue
-        git -c 'user.name=nobody' -c 'user.email=<>' -C "${LLAMACPP_DIR}" am "${patch}"
-    done
-}
-
-# Configure and build llama.cpp from LLAMACPP_DIR into BUILD_DIR.
-build() {
-    # CMAKE_DEFS and CMAKE_TARGETS hold several space-separated arguments and are left
-    # unquoted on purpose so they split into words; the directory paths are quoted.
-    cmake -S "${LLAMACPP_DIR}" -B "${BUILD_DIR}" ${CMAKE_DEFS}
-    cmake --build "${BUILD_DIR}" ${CMAKE_TARGETS} -j8
-    # remove unnecessary build artifacts
-    rm -f "${BUILD_DIR}/bin/ggml-common.h" "${BUILD_DIR}/bin/ggml-metal.metal"
-}
-
-dist() {
-    [ -z "${RUNNER}" ] && exit 1
-    mkdir -p ${RUNNER_BASE}/${RUNNER}/
-    for f in ${BUILD_DIR}/bin/* ; do
-        cp ${f} ${RUNNER_BASE}/${RUNNER}/
-    done
-    # check for lib directory
-    if [ -d ${BUILD_DIR}/lib ]; then
-        for f in ${BUILD_DIR}/lib/* ; do
-            cp ${f} ${RUNNER_BASE}/${RUNNER}/
-        done
-    fi
-}
-
-# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
-# Each file is compressed by a background job; the job PIDs are appended to compress_pids
-# so that wait_for_compress can collect them. Exits with status 1 when RUNNER is empty.
-compress() {
-    [ -z "${RUNNER}" ] && exit 1
-    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
-    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
-    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
-    # paths are quoted so names containing spaces or glob characters are handled intact
-    for f in "${BUILD_DIR}"/bin/* ; do
-        "${GZIP}" -c --best "${f}" > "${PAYLOAD_BASE}/${RUNNER}/$(basename "${f}").gz" &
-        compress_pids+=" $!"
-    done
-    # check for lib directory
-    if [ -d "${BUILD_DIR}/lib" ]; then
-        for f in "${BUILD_DIR}"/lib/* ; do
-            "${GZIP}" -c --best "${f}" > "${PAYLOAD_BASE}/${RUNNER}/$(basename "${f}").gz" &
-            compress_pids+=" $!"
-        done
-    fi
-    echo
-}
-
-# Block until every background job recorded in compress_pids has exited.
-wait_for_compress() {
-    # compress_pids is a space-separated list, so it is expanded unquoted to split it
-    for job in ${compress_pids}; do
-        wait "${job}"
-    done
-    echo "Finished compression"
-}
-
-install() { # gather the shared libraries produced by the build into ${BUILD_DIR}/bin/
-    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do # every *.${LIB_EXT} under BUILD_DIR that is not already in bin/
-        rm -f "${BUILD_DIR}/bin/$(basename ${lib})" # drop any stale copy before copying
-        cp -af "${lib}" "${BUILD_DIR}/bin/"
-    done
-}
-
-# Keep the local tree clean after we're done with the build
-# by forcing the llama.cpp submodule back to its recorded revision.
-cleanup() {
-    # quoted for consistency with the other uses of the path and to avoid word splitting
-    git submodule update --force "${LLAMACPP_DIR}"
-}
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@ -1,91 +0,0 @@
-#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be ./llm/generate/
-
-# TODO - add hardening to detect missing tools (cmake, etc.)
-
-set -ex # abort on the first failing command and trace every command
-set -o pipefail
-compress_pids="" # PIDs of background compression jobs; filled by compress(), consumed by wait_for_compress()
-echo "Starting darwin generate script"
-source $(dirname $0)/gen_common.sh # provides init_vars, git_module_setup, apply_patches, build, compress, cleanup, ...
-init_vars
-git_module_setup
-apply_patches
-
-# Code-sign the binary given as $1 with APPLE_IDENTITY; a no-op when APPLE_IDENTITY is unset.
-sign() {
-    if [ -n "$APPLE_IDENTITY" ]; then
-        # "$1" is quoted so a path containing spaces is passed to codesign as one argument
-        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama "$1"
-    fi
-}
-
-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
-
-case "${GOARCH}" in # amd64 builds three CPU runners, arm64 builds one Metal runner
-"amd64")
-    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
-
-    if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
-        #
-        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
-        #
-        init_vars # reset CMAKE_DEFS to the common baseline before adding this runner's flags
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building LCD CPU"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-
-        #
-        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-        # Approximately 400% faster than LCD on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu_avx
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building AVX CPU"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-
-        #
-        # ~2013 CPU Dynamic library
-        # Approximately 10% faster than AVX on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        RUNNER=cpu_avx2
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building AVX2 CPU"
-        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" # NOTE(review): build() in gen_common.sh does not read EXTRA_LIBS - confirm it is consumed elsewhere
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-    fi
-    ;;
-"arm64")
-
-    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
-        init_vars
-        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        RUNNER="metal"
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-    fi
-    ;;
-*)
-    echo "GOARCH must be set"
-    echo "this script is meant to be run from within go generate"
-    exit 1
-    ;;
-esac
-
-cleanup
-wait_for_compress # the compress jobs run in the background; wait for all of them before reporting
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -1,285 +0,0 @@
-#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be llm/generate/
-
-# First we build one or more CPU based LLM libraries
-#
-# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
-# library dependencies
-#
-# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
-# libraries are quite large, and also dynamically load data files at runtime
-# which in turn are large, so we don't attempt to carry them as payload
-
-set -ex # abort on the first failing command and trace every command
-set -o pipefail
-compress_pids="" # PIDs of background compression jobs; filled by compress(), consumed by wait_for_compress()
-
-# Print the AMD GPU target list. See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
-amdGPUs() {
-    if [ -n "${AMDGPU_TARGETS}" ]; then # a caller-supplied list takes precedence and is printed as is
-        echo "${AMDGPU_TARGETS}"
-        return
-    fi
-    GPU_LIST=(
-        "gfx900"
-        "gfx906:xnack-"
-        "gfx908:xnack-"
-        "gfx90a:xnack+"
-        "gfx90a:xnack-"
-        "gfx940"
-        "gfx941"
-        "gfx942"
-        "gfx1010"
-        "gfx1012"
-        "gfx1030"
-        "gfx1100"
-        "gfx1101"
-        "gfx1102"
-    )
-    ( # subshell so the IFS change does not leak into the caller
-        IFS=$';'
-        echo "'${GPU_LIST[*]}'" # [*] joins the entries with the first IFS character, giving a single-quoted ';'-separated list
-    )
-}
-
-echo "Starting linux generate script"
-if [ -z "${CUDACXX}" ]; then # locate nvcc only when the caller has not set CUDACXX
-    if [ -x /usr/local/cuda/bin/nvcc ]; then
-        export CUDACXX=/usr/local/cuda/bin/nvcc
-    else
-        # Fall back to whatever nvcc is on PATH, if any
-        export CUDACXX=$(command -v nvcc)
-    fi
-fi
-COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
-source $(dirname $0)/gen_common.sh # provides init_vars, git_module_setup, apply_patches, build, install, dist, compress, ...
-init_vars
-git_module_setup
-apply_patches
-
-init_vars
-if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
-    # Users building from source can tune the exact flags we pass to cmake for configuring
-    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
-    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
-        init_vars
-        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        RUNNER="cpu"
-        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-        echo "Building custom CPU"
-        build
-        install
-        dist
-        compress
-    else
-        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
-        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
-        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
-        # Note: the following seem to yield slower results than AVX2 - ymmv
-        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
-        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
-        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
-
-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # an unset OLLAMA_CPU_TARGET builds every variant; otherwise only the named one
-            #
-            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
-            #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            RUNNER=cpu
-            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-            echo "Building LCD CPU"
-            build
-            install
-            dist
-            compress
-        fi
-
-        if [ "${ARCH}" == "x86_64" ]; then
-            #
-            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
-            #
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
-                #
-                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-                # Approximately 400% faster than LCD on same CPU
-                #
-                init_vars
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-                RUNNER=cpu_avx
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-                echo "Building AVX CPU"
-                build
-                install
-                dist
-                compress
-            fi
-
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-                #
-                # ~2013 CPU Dynamic library
-                # Approximately 10% faster than AVX on same CPU
-                #
-                init_vars
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-                RUNNER=cpu_avx2
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-                echo "Building AVX2 CPU"
-                build
-                install
-                dist
-                compress
-            fi
-        fi
-    fi
-else
-    echo "Skipping CPU generation step as requested"
-fi
-
-# If needed, look for the default CUDA toolkit location
-if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
-    CUDA_LIB_DIR=/usr/local/cuda/lib64
-fi
-
-# If needed, look for CUDA on Arch Linux
-if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
-    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
-fi
-
-# Allow override in case libcudart is in the wrong place
-if [ -z "${CUDART_LIB_DIR}" ]; then
-    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
-fi
-
-if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
-    echo "CUDA libraries detected - building dynamic CUDA library"
-    init_vars
-    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true) # third dot-separated field of the first libcudart.so.* name, taken as the CUDA major version
-    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then # a caller-provided CUDA_VARIANT is kept
-        CUDA_VARIANT=_v${CUDA_MAJOR}
-    fi
-    if [ "${ARCH}" == "arm64" ]; then
-        echo "ARM CPU detected - disabling unsupported AVX instructions"
-
-        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
-        #
-        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
-        # Disabling has minimal performance effect while maintaining compatibility.
-        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
-    fi
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
-        echo "Building custom CUDA GPU"
-    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
-    fi
-    export CUDAFLAGS="-t8"
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    RUNNER=cuda${CUDA_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
-    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
-    build
-    install
-    dist
-    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
-    mkdir -p "${CUDA_DIST_DIR}"
-    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do # NOTE(review): libcudart is copied from CUDA_LIB_DIR; CUDART_LIB_DIR set above is not used in this section
-        cp -a "${lib}" "${CUDA_DIST_DIR}"
-    done
-    compress
-
-fi
-
-if [ -z "${ONEAPI_ROOT}" ]; then
-    # Try the default location in case it exists
-    ONEAPI_ROOT=/opt/intel/oneapi
-fi
-
-if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
-    echo "OneAPI libraries detected - building dynamic OneAPI library"
-    init_vars
-    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
-    CC=icx
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    RUNNER=oneapi
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
-    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
-    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
-    build
-
-    # copy oneAPI dependencies
-    mkdir -p "${ONEAPI_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
-    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
-    install
-    dist
-    compress
-fi
-
-if [ -z "${ROCM_PATH}" ]; then
-    # Try the default location in case it exists
-    ROCM_PATH=/opt/rocm
-fi
-
-if [ -z "${CLBlast_DIR}" ]; then
-    # Try the default location in case it exists
-    if [ -d /usr/lib/cmake/CLBlast ]; then
-        export CLBlast_DIR=/usr/lib/cmake/CLBlast
-    fi
-fi
-
-if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
-    echo "ROCm libraries detected - building dynamic ROCm library"
-    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
-        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
-    fi
-    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
-        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
-        echo "Building custom ROCM GPU"
-    fi
-    RUNNER=rocm${ROCM_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    # ROCm dependencies are too large to fit into a unified bundle
-    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
-    # TODO figure out how to disable runpath (rpath)
-    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
-    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
-    build
-
-    # copy the ROCM dependencies
-    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
-        cp -a "${dep}"* "${ROCM_DIST_DIR}"
-        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
-            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
-        fi
-    done
-    install
-    dist
-    compress
-fi
-
-cleanup
-wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -1,403 +0,0 @@
-#!powershell
-
-$ErrorActionPreference = "Stop"
-
-function amdGPUs {
-    if ($env:AMDGPU_TARGETS) {
-        return $env:AMDGPU_TARGETS
-    }
-    # Current supported rocblas list from ROCm v6.1.2 on windows
-    # https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
-    $GPU_LIST = @(
-        "gfx1030"
-        "gfx1100"
-        "gfx1101"
-        "gfx1102"
-    )
-    $GPU_LIST -join ';'
-}
-
-
-function init_vars {
-    write-host "Checking for cmake..."
-    get-command cmake
-    write-host "Checking for ninja..."
-    $d=(get-command -ea 'silentlycontinue' ninja).path
-    if ($null -eq $d) {
-        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
-        $matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
-        if ($matches.count -eq 0) {
-            throw "Unable to locate ninja"
-        }
-        $ninjaDir=($matches[0].FullName | split-path -parent)
-        $env:PATH="$env:PATH;$ninjaDir"
-    }
-    if (!$script:SRC_DIR) {
-        $script:SRC_DIR = $(resolve-path "..\..\")
-    }
-    if (!$script:llamacppDir) {
-        $script:llamacppDir = "../llama.cpp"
-    }
-    if (!$script:cmakeTargets) {
-        $script:cmakeTargets = @("ollama_llama_server")
-    }
-    $script:cmakeDefs = @(
-        "-DBUILD_SHARED_LIBS=on",
-        "-DGGML_NATIVE=off",
-        "-DGGML_OPENMP=off"
-        )
-    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
-    md "$script:DIST_BASE" -ea 0 > $null
-    if ($env:CGO_CFLAGS -contains "-g") {
-        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
-        $script:config = "RelWithDebInfo"
-    } else {
-        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
-        $script:config = "Release"
-    }
-    if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
-        $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
-    }
-    # Try to find the CUDA dir
-    if ($env:CUDA_LIB_DIR -eq $null) {
-        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($d -ne $null) {
-            $script:CUDA_LIB_DIR=($d| split-path -parent)
-            $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
-        }
-    } else {
-        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
-    }
-    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
-    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
-        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
-    } else {
-        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
-    }
-    # Note: Windows Kits 10 signtool crashes with GCP's plugin
-    if ($null -eq $env:SIGN_TOOL) {
-        ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
-    } else {
-        ${script:SignTool}=${env:SIGN_TOOL}
-    }
-    if ("${env:KEY_CONTAINER}") {
-        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
-    }
-}
-
-function git_module_setup {
-    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
-    & git submodule init
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & git submodule update --force "${script:llamacppDir}"
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-}
-
-function apply_patches {
-    # Apply temporary patches until fix is upstream
-    foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
-        git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
-    }
-}
-
-function build {
-    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
-    & cmake --version
-    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ($cmakeDefs -contains "-G") {
-        $extra=@("-j8")
-    } else {
-        $extra= @("--", "/maxCpuCount:8")
-    }
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
-    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # Rearrange output to be consistent between different generators
-    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
-        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
-        remove-item "${script:buildDir}/bin/${script:config}"
-    }
-}
-
-function sign {
-    if ("${env:KEY_CONTAINER}") {
-        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
-        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
-            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
-    }
-}
-
-function install {
-    write-host "Installing binaries to dist dir ${script:distDir}"
-    mkdir ${script:distDir} -ErrorAction SilentlyContinue
-    $binaries = dir "${script:buildDir}/bin/*.exe"
-    foreach ($file in $binaries) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
-    }
-
-    write-host "Installing dlls to dist dir ${script:distDir}"
-    $dlls = dir "${script:buildDir}/bin/*.dll"
-    foreach ($file in $dlls) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
-    }
-}
-
-function cleanup {
-    $patches = Get-ChildItem "../patches/*.diff"
-    foreach ($patch in $patches) {
-        # Extract file paths from the patch file
-        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
-            $parts = $_ -split ' '
-            ($parts[1] -split '/', 2)[1]
-        }
-
-        # Checkout each file
-        foreach ($file in $filePaths) {
-            git -C "${script:llamacppDir}" checkout $file
-        }
-        git -C "${script:llamacppDir}" checkout CMakeLists.txt
-    }
-}
-
-
-# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
-
-
-function build_cpu_x64 {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu"
-        $script:distDir="$script:DIST_BASE\cpu"
-        write-host "Building LCD CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}
-
-function build_cpu_arm64 {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
-        init_vars
-        write-host "Checking for clang..."
-        get-command clang
-        $env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
-        $env:CXXFLAGS="$env:CFLAGS"
-        $env:LDFLAGS="-static-libstdc++"
-        $script:cmakeDefs = $script:commonCpuDefs + @(
-            "-DCMAKE_VERBOSE_MAKEFILE=on",
-            "-DCMAKE_C_COMPILER=clang.exe",
-            "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
-        ) + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu"
-        $script:distDir="$script:DIST_BASE\cpu"
-        write-host "Building LCD CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}
-
-
-function build_cpu_avx() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
-        $script:distDir="$script:DIST_BASE\cpu_avx"
-        write-host "Building AVX CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX generation step as requested"
-    }
-}
-
-function build_cpu_avx2() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
-        $script:distDir="$script:DIST_BASE\cpu_avx2"
-        write-host "Building AVX2 CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX2 generation step as requested"
-    }
-}
-
-function build_cuda() {
-    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
-        # Then build cuda as a dynamically loaded library
-        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
-        if ($null -ne $script:CUDA_VERSION) {
-            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
-        }
-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-        $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
-        $script:cmakeDefs += @(
-            "-A", "x64",
-            "-DGGML_CUDA=ON",
-            "-DGGML_AVX=on",
-            "-DGGML_AVX2=off",
-            "-DCMAKE_CUDA_FLAGS=-t6",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
-            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
-            )
-        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
-            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
-            $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
-            write-host "building custom CUDA GPU"
-        }
-        build
-        sign
-        install
-
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    } else {
-        write-host "Skipping CUDA generation step"
-    }
-}
-
-function build_oneapi() {
-  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}"))  {
-    # Get oneAPI version
-    $script:ONEAPI_VERSION = icpx --version
-    $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
-    if ($null -ne $script:ONEAPI_VERSION) {
-      $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
-    }
-    init_vars
-    $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
-    $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
-    $script:cmakeDefs += @(
-      "-G", "MinGW Makefiles",
-      "-DGGML_SYCL=ON",
-      "-DCMAKE_C_COMPILER=icx",
-      "-DCMAKE_CXX_COMPILER=icx",
-      "-DCMAKE_BUILD_TYPE=Release"
-    )
-
-    Write-Host "Building oneAPI"
-    build
-    # Ninja doesn't prefix with config name
-    if ($null -ne $script:DUMPBIN) {
-      & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
-    }
-    sign
-    install
-
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-  } else {
-    Write-Host "Skipping oneAPI generation step"
-  }
-}
-
-function build_rocm() {
-    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
-        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
-        if ($null -ne $script:ROCM_VERSION) {
-            $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
-        }
-
-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
-        $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
-        $script:cmakeDefs += @(
-            "-G", "Ninja",
-            "-DCMAKE_C_COMPILER=clang.exe",
-            "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DGGML_HIPBLAS=on",
-            "-DHIP_PLATFORM=amd",
-            "-DGGML_AVX=on",
-            "-DGGML_AVX2=off",
-            "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
-            "-DAMDGPU_TARGETS=$(amdGPUs)",
-            "-DGPU_TARGETS=$(amdGPUs)"
-            )
-
-        # Make sure the ROCm binary dir is first in the path
-        $env:PATH="$env:HIP_PATH\bin;$env:PATH"
-
-        # We have to clobber the LIB var from the developer shell for clang to work properly
-        $env:LIB=""
-        if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
-            write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
-            $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
-            write-host "building custom ROCM GPU"
-        }
-        write-host "Building ROCm"
-        build
-        # Ninja doesn't prefix with config name
-        ${script:config}=""
-        if ($null -ne $script:DUMPBIN) {
-            & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
-        }
-        sign
-        install
-
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
-    } else {
-        write-host "Skipping ROCm generation step"
-    }
-}
-
-init_vars
-if ($($args.count) -eq 0) {
-    git_module_setup
-    apply_patches
-    if ($script:ARCH -eq "arm64") {
-        build_cpu_arm64
-    } else { # amd64
-        build_cpu_x64
-        build_cpu_avx
-        build_cpu_avx2
-        build_cuda
-        build_oneapi
-        build_rocm
-    }
-
-    cleanup
-    write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
-} else {
-    for ( $i = 0; $i -lt $args.count; $i++ ) {
-        write-host "performing $($args[$i])"
-        & $($args[$i])
-    }
-}
--- a/llm/generate/generate_darwin.go
+++ b/llm/generate/generate_darwin.go
@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_darwin.sh
--- a/llm/generate/generate_linux.go
+++ b/llm/generate/generate_linux.go
@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_linux.sh
--- a/llm/generate/generate_windows.go
+++ b/llm/generate/generate_windows.go
@ -1,3 +0,0 @@
-package generate
-
-//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
--- a/llm/ggml.go
+++ b/llm/ggml.go
@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }

-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@ -368,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()

 	layers := llm.Tensors().Layers()

+	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
@ -400,6 +403,42 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
+	case "mllama":
+		var visionTokens, tiles uint64 = 1601, 4
+
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2* // sizeof(float16)
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					context +
+					4* // sizeof(float32)
+						uint64(crossAttentionLayers.size)* // num cross attention layers
+						visionTokens*
+						tiles)
+		}
+
+		fullOffload = max(
+			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
+			// vocab graph
+			4*batch*(embedding+vocab),
+		)
+
+		var ropeFreqsCount uint64
+		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
+				ropeFreqsCount = ropeFreqsWeights.parameters()
+			}
+		}
+
+		partialOffload = max(
+			4*(batch*
+				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
+				ropeFreqsCount+
+				embeddingHeadsK*context*headsKV),
+			// vocab graph
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
 	case "gemma", "gemma2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@ -1 +0,0 @@
-Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555
--- a/llm/memory.go
+++ b/llm/memory.go
@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}

-	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
-
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}

+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
--- a/llm/patches/0000-cmakelist.patch
+++ b/llm/patches/0000-cmakelist.patch
@ -1,22 +0,0 @@
-From 7a3555098d4591c9b329c677654497ed8cee07ec Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Fri, 23 Aug 2024 11:27:48 -0700
-Subject: [PATCH] patch cmakelist
-
---
- CMakeLists.txt | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 415743c2..aaadd13e 100644
--- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -210,3 +210,5 @@ if (LLAMA_BUILD_EXAMPLES)
-     add_subdirectory(examples)
-     add_subdirectory(pocs)
- endif()
-+
-+add_subdirectory(../ext_server ext_server) # ollama
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0001-load-progress.patch
+++ b/llm/patches/0001-load-progress.patch
@ -1,44 +0,0 @@
-From c97ed60c3369294d5551ba099a88ddc509687df1 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 16:55:15 -0600
-Subject: [PATCH] patch load progress
-
---
- common/common.cpp | 2 ++
- common/common.h   | 7 +++++++
- 2 files changed, 9 insertions(+)
-
-diff --git a/common/common.cpp b/common/common.cpp
-index 8d0ed4f9..a09e8a53 100644
--- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -955,6 +955,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
-     mparams.use_mmap        = params.use_mmap;
-     mparams.use_mlock       = params.use_mlock;
-     mparams.check_tensors   = params.check_tensors;
-+    mparams.progress_callback = params.progress_callback;
-+    mparams.progress_callback_user_data = params.progress_callback_user_data;
-     if (params.kv_overrides.empty()) {
-         mparams.kv_overrides = NULL;
-     } else {
-diff --git a/common/common.h b/common/common.h
-index cb87c447..818a4a4a 100644
--- a/common/common.h
-+++ b/common/common.h
-@@ -266,6 +266,13 @@ struct gpt_params {
-     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
-     std::vector<std::string> image; // path to image file(s)
- 
-+    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-+    // If the provided progress_callback returns true, model loading continues.
-+    // If it returns false, model loading is immediately aborted.
-+    llama_progress_callback progress_callback = NULL;
-+    // context pointer passed to the progress callback
-+    void * progress_callback_user_data;
-+
-     // embedding
-     bool embedding         = false; // get only sentence embedding
-     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0002-clip-log.patch
+++ b/llm/patches/0002-clip-log.patch
@ -1,24 +0,0 @@
-From 6fdf4268e13e56f0050fa6a29b029cbd54be49d2 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 16:58:03 -0600
-Subject: [PATCH] clip log
-
---
- examples/llava/clip.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 8aa7b075..b8941c74 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -3,6 +3,7 @@
- // I'll gradually clean and extend it
- // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
- #include "clip.h"
-+#include "common.h"
- #include "ggml.h"
- #include "ggml-alloc.h"
- #include "ggml-backend.h"
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0003-load_exception.patch
+++ b/llm/patches/0003-load_exception.patch
@ -1,57 +0,0 @@
-From 4f2b9cd0f012c49f40d0784454864ad41ca418b2 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:00:28 -0600
-Subject: [PATCH] load exception
-
---
- src/llama.cpp | 25 ++++++++++++++++---------
- 1 file changed, 16 insertions(+), 9 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index af8afd84..4d1db3d5 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8871,7 +8871,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
-         }
-     } catch (const std::exception & err) {
-         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
-+        throw;
-     }
- 
-     // loading time will be recalculate after the first eval, so
-@@ -18675,16 +18675,23 @@ struct llama_model * llama_load_model_from_file(
-         }
-         model->rpc_servers.push_back(servers);
-     }
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-+
-+    try {
-+        int status = llama_model_load(path_model, *model, params);
-+        GGML_ASSERT(status <= 0);
-+        if (status < 0) {
-+            if (status == -1) {
-+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-+            } else if (status == -2) {
-+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-+            }
-+            delete model;
-+            return nullptr;
-         }
-+    } catch (...) {
-+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
-         delete model;
-        return nullptr;
-+        throw;
-     }
- 
-     return model;
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0004-metal.patch
+++ b/llm/patches/0004-metal.patch
@ -1,57 +0,0 @@
-From 91d3f886f1645b38d9658c0e125603e8d5338146 Mon Sep 17 00:00:00 2001
-From: nobody <>
-Date: Tue, 1 Oct 2024 13:55:01 -0600
-Subject: [PATCH] metal
-
---
- ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
- 1 file changed, 13 insertions(+), 17 deletions(-)
-
-diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
-+++ b/ggml/src/ggml-metal.m
-@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
-                 // to the matrix-vector kernel
-                 int ne11_mm_min = 1;
- 
-#if 0
-                 // the numbers below are measured on M2 Ultra for 7B and 13B models
-                 // these numbers do not translate to other devices or model sizes
-                 // TODO: need to find a better approach
-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                            switch (src0t) {
-                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q4_0:
-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                case GGML_TYPE_Q5_0:                          // not tested yet
-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                default:             ne11_mm_min = 1;  break;
-                            }
-+                        switch (src0t) {
-+                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-+                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q4_0:
-+                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-+                            case GGML_TYPE_Q5_0:                          // not tested yet
-+                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-+                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-+                            default:             ne11_mm_min = 1;  break;
-                         }
-#endif
- 
-                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0005-default-pretokenizer.patch
+++ b/llm/patches/0005-default-pretokenizer.patch
@ -1,44 +0,0 @@
-From 0e531d69786c4a96a3a2bcf7b2d576bd6f7edf25 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:13 -0700
-Subject: [PATCH] 05-default-pretokenizer.diff
-
---
- src/llama.cpp | 14 +++-----------
- 1 file changed, 3 insertions(+), 11 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..800dfb95 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
-         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
-             vocab.tokenizer_add_space_prefix = false;
-             vocab.tokenizer_clean_spaces = true;
-            if (tokenizer_pre.empty()) {
-                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
-                LLAMA_LOG_WARN("%s:                                             \n", __func__);
-                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
-                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
-                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
-                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
-                LLAMA_LOG_WARN("%s:                                             \n", __func__);
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-            } else if (tokenizer_pre == "default") {
-+            if (tokenizer_pre == "default") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-             } else if (
-                     tokenizer_pre == "llama3"   ||
-@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
-                 vocab.tokenizer_add_bos = true;
-                 vocab.tokenizer_clean_spaces = false;
-             } else {
-                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
-+                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-             }
-         } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0006-embeddings.patch
+++ b/llm/patches/0006-embeddings.patch
@ -1,54 +0,0 @@
-From 235b6d876a74cb09abe26985fa89ebe5bfc9f562 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:06:17 -0600
-Subject: [PATCH] embeddings
-
---
- src/llama.cpp | 15 +++++++++------
- 1 file changed, 9 insertions(+), 6 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 1a8e0c51..e55ec3f8 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -16516,7 +16516,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
-     const auto n_embd  = hparams.n_embd;
- 
-     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
-+    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
- 
-     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -16794,20 +16794,23 @@ static int llama_decode_internal(
-             // no output
-             res  = nullptr;
-             embd = nullptr;
-        } else if (cparams.embeddings) {
-            res  = nullptr; // do not extract logits for embedding case
-            embd = nullptr;
-+        }
-+
-+        if (cparams.embeddings) {
-             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-+                embd = ggml_graph_node(gf, i);
-                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-                    embd = ggml_graph_node(gf, i);
-                     break;
-                 }
-             }
-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
-             embd = nullptr; // do not extract embeddings when not needed
-             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
-         }
-+
-+        if (!cparams.causal_attn) {
-+            res = nullptr; // do not extract logits when not needed
-+        }
-         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- 
-         ggml_backend_sched_alloc_graph(lctx.sched, gf);
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0007-clip-unicode.patch
+++ b/llm/patches/0007-clip-unicode.patch
@ -1,54 +0,0 @@
-From 01c42149cbdc194644a2f138598029938e0dd447 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:09:57 -0600
-Subject: [PATCH] clip unicode
-
---
- examples/llava/clip.cpp | 23 +++++++++++++++++++++++
- 1 file changed, 23 insertions(+)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index b8941c74..3a735f17 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -40,6 +40,14 @@
- #include <cinttypes>
- #include <limits>
- 
-+#if defined(_WIN32)
-+#define WIN32_LEAN_AND_MEAN
-+#ifndef NOMINMAX
-+    #define NOMINMAX
-+#endif
-+#include <windows.h>
-+#endif
-+
- #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-@@ -1227,7 +1235,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-             return nullptr;
-         }
- 
-+#ifdef _WIN32
-+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
-+        if (!wlen) {
-+            return NULL;
-+        }
-+        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
-+        if (!wlen) {
-+            free(wbuf);
-+            return NULL;
-+        }
-+        auto fin = std::ifstream(wbuf, std::ios::binary);
-+        free(wbuf);
-+#else
-         auto fin = std::ifstream(fname, std::ios::binary);
-+#endif
-         if (!fin) {
-             LOG_ERR("cannot open model file for loading tensors\n");
-             clip_free(new_clip);
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/patches/0008-solar-pro.patch
+++ b/llm/patches/0008-solar-pro.patch
@ -1,412 +0,0 @@
-From a8fe40fa7b026d2db9bb6aeecd24fcd2027110ec Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:16 -0700
-Subject: [PATCH] add solar-pro support
-
-solar-pro introduces block skip connections where blocks are connected
-to other, non-sequential blocks with a scale multiple
-
-this change adds 4 new keys to store the skip connections and one new
-tensor to store the scalar. the scalar is implemented as a 1-dimensional
-tensor with 2 elements derived from the model's bskcn_tv configuration.
-in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
- src/llama.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 255 insertions(+), 15 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..c6fc0c3f 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -217,6 +217,7 @@ enum llm_arch {
-     LLM_ARCH_GRANITE,
-     LLM_ARCH_GRANITE_MOE,
-     LLM_ARCH_CHAMELEON,
-+    LLM_ARCH_SOLAR,
-     LLM_ARCH_UNKNOWN,
- };
- 
-@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_GRANITE,         "granite"      },
-     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
-     { LLM_ARCH_CHAMELEON,       "chameleon"    },
-+    { LLM_ARCH_SOLAR,           "solar"        },
-     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
- };
- 
-@@ -327,6 +329,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
-     LLM_KV_ATTENTION_SLIDING_WINDOW,
-     LLM_KV_ATTENTION_SCALE,
-+    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
- 
-     LLM_KV_ROPE_DIMENSION_COUNT,
-     LLM_KV_ROPE_FREQ_BASE,
-@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
-     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
- 
-    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
-    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
-    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
-    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-+    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"               },
-+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"            },
-+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"           },
-+    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"                },
-+    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"               },
-+    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"             },
-+    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"       },
-+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon"   },
-+    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                   },
-+    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"              },
-+    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"             },
-+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count"   },
-+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
-+    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                    },
-+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
- 
-     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-@@ -608,6 +612,7 @@ enum llm_tensor {
-     LLM_TENSOR_ENC_OUTPUT_NORM,
-     LLM_TENSOR_CLS,
-     LLM_TENSOR_CLS_OUT,
-+    LLM_TENSOR_BSKCN_TV,
- };
- 
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1527,6 +1532,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
-             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
-         },
-     },
-+
-+    {
-+        LLM_ARCH_SOLAR,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-+            { LLM_TENSOR_OUTPUT,          "output" },
-+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-+            { LLM_TENSOR_BSKCN_TV,        "bskcn_tv" },
-+        },
-+    },
-     {
-         LLM_ARCH_UNKNOWN,
-         {
-@@ -2360,6 +2384,7 @@ enum e_model {
-     MODEL_15B,
-     MODEL_16B,
-     MODEL_20B,
-+    MODEL_22B,
-     MODEL_30B,
-     MODEL_34B,
-     MODEL_35B,
-@@ -2409,6 +2434,8 @@ struct llama_hparams {
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
- 
-+    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
-+
-     uint32_t n_layer_dense_lead = 0;
-     uint32_t n_lora_q = 0;
-     uint32_t n_lora_kv = 0;
-@@ -2479,6 +2506,7 @@ struct llama_hparams {
-         if (this->n_head_arr    != other.n_head_arr)    return true;
-         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-         if (this->n_ff_arr      != other.n_ff_arr)      return true;
-+        if (this->n_bskcn_arr   != other.n_bskcn_arr)   return true;
- 
-         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
-         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2588,6 +2616,14 @@ struct llama_hparams {
-             return ssm_d_state * ssm_d_inner;
-         }
-     }
-+
-+    bool n_bskcn(uint32_t n, uint32_t il = 0) const {
-+        if (il < n_layer) {
-+            return n_bskcn_arr[n][il] > 0;
-+        }
-+
-+        GGML_ABORT("fatal error");
-+    }
- };
- 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2769,6 +2805,8 @@ struct llama_layer {
-     struct ggml_tensor * ffn_gate_scale;
-     struct ggml_tensor * ffn_up_scale;
-     struct ggml_tensor * ffn_down_scale;
-+
-+    struct ggml_tensor * bskcn_tv;
- };
- 
- // very similar to llama_batch,
-@@ -6134,6 +6172,21 @@ static void llm_load_hparams(
-                     default: model.type = e_model::MODEL_UNKNOWN;
-                }
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-+
-+                for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
-+                    auto & bskcn = hparams.n_bskcn_arr.at(i);
-+                    bskcn.fill(0);
-+                    ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
-+                }
-+
-+                switch (hparams.n_layer) {
-+                    case 64: model.type = e_model::MODEL_22B; break;
-+                    default: model.type = e_model::MODEL_UNKNOWN;
-+                }
-+            }
-         default: (void)0;
-     }
- 
-@@ -8839,6 +8892,37 @@ static bool llm_load_tensors(
- 
-                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- 
-+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-+                    }
-+                } break;
-+            case LLM_ARCH_SOLAR:
-+                {
-+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-+
-+                    // output
-+                    {
-+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-+                    }
-+
-+                    for (int i = 0; i < n_layer; ++i) {
-+                        ggml_context * ctx_layer = ctx_for_layer(i);
-+                        ggml_context * ctx_split = ctx_for_layer_split(i);
-+
-+                        auto & layer = model.layers[i];
-+
-+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-+
-+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-+
-+                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-@@ -16009,7 +16093,6 @@ struct llm_build_context {
- 
-         return gf;
-     }
-
-     // ref: https://github.com/facebookresearch/chameleon
-     // based on the original build_llama() function, changes:
-     //   * qk-norm
-@@ -16187,6 +16270,158 @@ struct llm_build_context {
- 
-         return gf;
-     }
-+
-+    ggml_cgraph * build_solar() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-+
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
-+        const int64_t n_embd_head = hparams.n_embd_head_v;
-+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-+        GGML_ASSERT(n_embd_head == hparams.n_rot);
-+
-+        struct ggml_tensor * cur;
-+        struct ggml_tensor * inpL;
-+
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-+
-+        // inp_pos - contains the positions
-+        struct ggml_tensor * inp_pos = build_inp_pos();
-+
-+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-+
-+        struct ggml_tensor * bskcn_1;
-+        struct ggml_tensor * bskcn_2;
-+
-+        for (int il = 0; il < n_layer; ++il) {
-+            struct ggml_tensor * inpSA = inpL;
-+
-+            if (hparams.n_bskcn(0, il)) {
-+                bskcn_1 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(1, il)) {
-+                bskcn_2 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(2, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            if (hparams.n_bskcn(3, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
-+                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "attn_norm", il);
-+
-+            // self-attention
-+            {
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-+                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-+                cb(Qcur, "Qcur", il);
-+                if (model.layers[il].bq) {
-+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-+                    cb(Qcur, "Qcur", il);
-+                }
-+
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-+                cb(Kcur, "Kcur", il);
-+                if (model.layers[il].bk) {
-+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-+                    cb(Kcur, "Kcur", il);
-+                }
-+
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-+                cb(Vcur, "Vcur", il);
-+                if (model.layers[il].bv) {
-+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-+                    cb(Vcur, "Vcur", il);
-+                }
-+
-+                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
-+
-+                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Kcur, "Kcur", il);
-+
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-+                        model.layers[il].wo, model.layers[il].bo,
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-+            }
-+
-+            if (il == n_layer - 1) {
-+                // skip computing output for unused tokens
-+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                n_tokens = n_outputs;
-+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-+            }
-+
-+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-+            cb(ffn_inp, "ffn_inp", il);
-+
-+            // feed-forward network
-+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-+                    model.layers[il].ffn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "ffn_norm", il);
-+
-+            cur = llm_build_ffn(ctx0, lctx, cur,
-+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-+                    NULL,
-+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = ggml_add(ctx0, cur, ffn_inp);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = lctx.cvec.apply_to(ctx0, cur, il);
-+            cb(cur, "l_out", il);
-+
-+            // input for next layer
-+            inpL = cur;
-+        }
-+
-+        cur = inpL;
-+
-+        cur = llm_build_norm(ctx0, cur, hparams,
-+                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
-+        cb(cur, "result_norm", -1);
-+
-+        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-+        cb(cur, "result_output", -1);
-+
-+        ggml_build_forward_expand(gf, cur);
-+
-+        return gf;
-+    }
- };
- 
- static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16451,6 +16686,10 @@ static struct ggml_cgraph * llama_build_graph(
-             {
-                 result = llm.build_chameleon();
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                result = llm.build_solar();
-+            } break;
-         default:
-             GGML_ABORT("fatal error");
-     }
-@@ -19594,6 +19833,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_GRANITE:
-         case LLM_ARCH_GRANITE_MOE:
-         case LLM_ARCH_CHAMELEON:
-+        case LLM_ARCH_SOLAR:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
-- 
-2.39.3 (Apple Git-146)
-
--- a/llm/server.go
+++ b/llm/server.go
@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}

 	if opts.NumGPU >= 0 {
@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}

-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()

 	for _, g := range gpus {
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":                    {"num_gpu", "1"},
 		"main_gpu 1":                   {"main_gpu", "1"},
 		"low_vram true":                {"low_vram", "true"},
-		"f16_kv true":                  {"f16_kv", "true"},
 		"logits_all true":              {"logits_all", "true"},
 		"vocab_only true":              {"vocab_only", "true"},
 		"use_mmap true":                {"use_mmap", "true"},
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@ -6,23 +6,18 @@ set -e

 mkdir -p dist

+# These require Xcode v13 or older to target MacOS v11
+# If installed to an alternate location use the following to enable
+# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
+export CGO_CFLAGS=-mmacosx-version-min=11.3
+export CGO_CXXFLAGS=-mmacosx-version-min=11.3
+export CGO_LDFLAGS=-mmacosx-version-min=11.3
+
 for TARGETARCH in arm64 amd64; do
-    if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
    echo "Building Go runner darwin $TARGETARCH"
    rm -rf llama/build
    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
-    else
-        echo "Building C++ runner darwin $TARGETARCH"
-        rm -rf llm/build
-        GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    fi
-    # These require Xcode v13 or older to target MacOS v11
-    # If installed to an alternate location use the following to enable
-    # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-    # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-    export CGO_CFLAGS=-mmacosx-version-min=11.3
-    export CGO_CXXFLAGS=-mmacosx-version-min=11.3
-    export CGO_LDFLAGS=-mmacosx-version-min=11.3
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
 done
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@ -19,7 +19,7 @@ docker buildx build \
    ${LOAD_OR_PUSH} \
    --platform=${PLATFORM} \
    ${OLLAMA_COMMON_BUILD_ARGS} \
-    -f ${DOCKERFILE_DIR}Dockerfile \
+    -f Dockerfile \
    -t ${FINAL_IMAGE_REPO}:$VERSION \
    .

@ -29,7 +29,7 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
        --platform=linux/amd64 \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target runtime-rocm \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
        .
 fi
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@ -19,7 +19,7 @@ docker buildx build \
        --platform=${PLATFORM} \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target dist \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
        .

 # buildx behavior changes for single vs. multiplatform
--- a/scripts/build_remote.py
+++ b/scripts/build_remote.py
@ -1,76 +0,0 @@
-#!/usr/bin/env python3
-import subprocess
-import sys
-from urllib.parse import urlparse
-from git import Repo
-
-# Helper script to be able to build on remote repos using git to push local changes
-# (e.g. particularly helpful to target a remote windows build system)
-#
-# Typical windows remote git config looks like this:
-#
-#[remote "windows-pa"]
-#        url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
-#        fetch = +refs/heads/*:refs/remotes/windows-pa/*
-#        uploadpack = powershell git upload-pack
-#        receivepack = powershell git receive-pack
-#
-
-# TODO - add argpare and make this more configurable 
-# - force flag becomes optional
-# - generate, build or test ...
-
-# Note: remote repo will need this run once:
-# git config --local receive.denyCurrentBranch updateInstead
-repo = Repo(".")
-
-# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
-# GoCmd = "/usr/local/go/bin/go" 
-GoCmd = "go" 
-
-if repo.is_dirty():
-    print("Tree is dirty.  Commit your changes before running this script")
-    sys.exit(1)
-
-if len(sys.argv) != 2:
-    print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
-    sys.exit(1)
-remote_name = sys.argv[1]
-
-remote = {r.name: r for r in repo.remotes}[remote_name]
-raw_url = list(remote.urls)[0]
-url = urlparse(raw_url)
-# Windows urls don't quite parse properly
-if url.scheme == "" and url.netloc == "":
-    url = urlparse("ssh://" + raw_url)
-print("URL: " + str(url))
-netloc = url.netloc.split(":")[0]
-path = url.path
-branch_name = repo.active_branch.name
-
-print("Force pushing content to remote...")
-# Use with care given the force push
-remote.push(force=True).raise_if_error()
-
-print("Ensuring correct branch checked out on remote via ssh...")
-subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
-
-
-# TODO - add some hardening to try to figure out how to set up the path properly
-# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
-# TODO - or consider paramiko maybe
-
-print("Running Windows Build Script")
-subprocess.check_call(['ssh', netloc, 'cd', path, ';', "powershell", "-ExecutionPolicy", "Bypass", "-File", "./scripts/build_windows.ps1"])
-
-# print("Building")
-# subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
-
-print("Copying built result")
-subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe",  './dist/'])
-
-print("Copying installer")
-subprocess.check_call(['scp', netloc +":"+ path + "/dist/Ollama Setup.exe",  './dist/'])
-
-
-
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@ -75,7 +75,6 @@ function checkEnv() {
    } else {
        write-host "Code signing disabled - please set KEY_CONTAINERS to sign and copy ollama_inc.crt to the top of the source tree"
    }
-
 }


@ -83,50 +82,7 @@ function buildOllama() {
    if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
        write-host "Building ollama runners"
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        if ($null -eq ${env:OLLAMA_NEW_RUNNERS}) {
-            # Start by skipping CUDA to build everything else
-            write-host "Building ollama runners"
-            powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
-
-            # Then skip everyhting else and build all the CUDA variants
-            foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
-                write-host "Building CUDA ${env:CUDA_LIB_DIR} runner"
-
-                if ($env:CUDA_LIB_DIR.Contains("v12")) {
-                    powershell -Command {
-                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                        $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-                        $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
-                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                        & go generate ./...
-                    }
-                } else {
-                    powershell -Command {
-                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                        $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-                        $env:OLLAMA_CUSTOM_CUDA_DEFS=""
-                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                        & go generate ./...
-                    }
-                }
-                if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            }
-        } else {
        & make -C llama -j 12
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
-        
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    } else {
        write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@ -134,11 +90,6 @@ function buildOllama() {
    write-host "Building ollama CLI"
    & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ("${env:KEY_CONTAINER}") {
-        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    }
    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }
@ -149,11 +100,6 @@ function buildApp() {
    & windres -l 0 -o ollama.syso ollama.rc
    & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" -o "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" .
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ("${env:KEY_CONTAINER}") {
-        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe"
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    }
 }

 function gatherDependencies() {
@ -172,7 +118,7 @@ function gatherDependencies() {
    } else {
        $depArch=$script:TARGET_ARCH
    }
-    if ($depArch -eq "amd64") {
+    if ($depArch -eq "x64") {
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
        cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
@ -186,16 +132,19 @@ function gatherDependencies() {
        copy-item -path "${env:VCToolsRedistDir}\vc_redist.arm64.exe" -destination "${script:DIST_DIR}" -verbose
    }

-
    cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
+}
+
+function sign() {
    if ("${env:KEY_CONTAINER}") {
-        write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DIST_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
-            write-host "signing $file"
+        write-host "Signing Ollama executables, scripts and libraries"
        & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-                /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
+            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} `
+            $(get-childitem -path "${script:SRC_DIR}\dist" -r -include @('ollama_welcome.ps1')) `
+            $(get-childitem -path "${script:SRC_DIR}\dist\windows-*" -r -include @('*.exe', '*.dll'))
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
+    } else {
+        write-host "Signing not enabled"
    }
 }

@ -226,6 +175,7 @@ try {
        buildOllama
        buildApp
        gatherDependencies
+        sign
        buildInstaller
        distZip
    } else {
--- a/scripts/env.sh
+++ b/scripts/env.sh
@ -20,12 +20,6 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
    --build-arg=CUSTOM_CPU_FLAGS \
    --build-arg=GPU_RUNNER_CPU_FLAGS \
    --build-arg=AMDGPU_TARGETS"
-OLLAMA_NEW_RUNNERS=${OLLAMA_NEW_RUNNERS:-""}
-if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
-    DOCKERFILE_DIR="./llama/"
-else
-    DOCKERFILE_DIR="./"
-fi

 echo "Building Ollama"
 echo "VERSION=$VERSION"
--- a/server/imageproc/images_test.go
+++ b/server/imageproc/images_test.go
@ -120,6 +120,78 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
 			TileSize:      560,
 			Expected:      image.Point{1120, 1120},
 		},
+		{
+			ImageSize:     image.Point{800, 600},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{1120, 1120},
+		},
+		{
+			ImageSize:     image.Point{640, 480},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{1120, 560},
+		},
+		{
+			ImageSize:     image.Point{320, 200},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 560},
+		},
+		{
+			ImageSize:     image.Point{1320, 200},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{1680, 560},
+		},
+		{
+			ImageSize:     image.Point{2000, 200},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{2240, 560},
+		},
+		{
+			ImageSize:     image.Point{10000, 200},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{2240, 560},
+		},
+		{
+			ImageSize:     image.Point{480, 640},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 1120},
+		},
+		{
+			ImageSize:     image.Point{200, 320},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 560},
+		},
+		{
+			ImageSize:     image.Point{200, 1320},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 1680},
+		},
+		{
+			ImageSize:     image.Point{200, 2000},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 2240},
+		},
+		{
+			ImageSize:     image.Point{200, 10000},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{560, 2240},
+		},
+		{
+			ImageSize:     image.Point{10000, 10000},
+			MaxImageTiles: 4,
+			TileSize:      560,
+			Expected:      image.Point{1120, 1120},
+		},
 	}

 	for _, c := range cases {
--- a/server/images.go
+++ b/server/images.go
@ -690,7 +690,8 @@ func CopyModel(src, dst model.Name) error {
 }

 func deleteUnusedLayers(deleteMap map[string]struct{}) error {
-	manifests, err := Manifests()
+	// Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+	manifests, err := Manifests(true)
 	if err != nil {
 		return err
 	}
@ -853,8 +854,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	manifest, _, err := GetManifest(mp)
 	if errors.Is(err, os.ErrNotExist) {
 		// noop
-	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
-		return err
+	} else if err != nil {
+		slog.Warn("pulling model with bad existing manifest", "name", name, "error", err)
 	} else {
 		for _, l := range manifest.Layers {
 			deleteMap[l.Digest] = struct{}{}
--- a/server/layer.go
+++ b/server/layer.go
@ -106,7 +106,8 @@ func (l *Layer) Remove() error {
 		return nil
 	}

-	ms, err := Manifests()
+	// Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+	ms, err := Manifests(true)
 	if err != nil {
 		return err
 	}
--- a/server/manifest.go
+++ b/server/manifest.go
@ -123,7 +123,7 @@ func WriteManifest(name model.Name, config Layer, layers []Layer) error {
 	return json.NewEncoder(f).Encode(m)
 }

-func Manifests() (map[model.Name]*Manifest, error) {
+func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
 	manifests, err := GetManifestPath()
 	if err != nil {
 		return nil, err
@ -145,22 +145,29 @@ func Manifests() (map[model.Name]*Manifest, error) {
 		if !fi.IsDir() {
 			rel, err := filepath.Rel(manifests, match)
 			if err != nil {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", match, err)
+				}
 				slog.Warn("bad filepath", "path", match, "error", err)
 				continue
 			}

 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", rel, err)
+				}
 				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}

 			m, err := ParseNamedManifest(n)
-			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+			if err != nil {
+				if !continueOnError {
+					return nil, fmt.Errorf("%s %w", n, err)
+				}
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
-			} else if err != nil {
-				return nil, fmt.Errorf("%s: %w", n, err)
 			}

 			ms[n] = m
--- a/server/manifest_test.go
+++ b/server/manifest_test.go
@ -112,7 +112,7 @@ func TestManifests(t *testing.T) {
 				createManifest(t, d, p)
 			}

-			ms, err := Manifests()
+			ms, err := Manifests(true)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/server/prompt.go
+++ b/server/prompt.go
@ -27,6 +27,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

 	isMllama := checkMllamaModelFamily(m)

+	var imageNumTokens int
+	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+	if isMllama {
+		// Our mllama implementation packs all of the embeddings into a single token
+		imageNumTokens = 1
+	} else {
+		// CLIP images are represented as 768 tokens, each an embedding
+		imageNumTokens = 768
+	}
+
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		ctxLen := len(s)
 		if m.ProjectorPaths != nil {
 			for _, m := range msgs[i:] {
-				// images are represented as 768 sized embeddings
-				// TODO: get embedding length from project metadata
-				ctxLen += 768 * len(m.Images)
+				ctxLen += imageNumTokens * len(m.Images)
 			}
 		}

@ -75,11 +83,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

 	currMsgIdx := n

+	for cnt, msg := range msgs[currMsgIdx:] {
+		prefix := ""
+		imgPrompt := ""
+		prompt := msg.Content
+
+		for _, i := range msg.Images {
+			var imgData llm.ImageData
+
 			if isMllama {
-		lastMsgIdx := len(msgs) - 1
-		for i := lastMsgIdx; i >= currMsgIdx; i-- {
-			if len(msgs[i].Images) > 0 {
-				data, aspectRatioID, err := imageproc.Preprocess(msgs[i].Images[0])
+				data, aspectRatioID, err := imageproc.Preprocess(i)
 				if err != nil {
 					return "", nil, err
 				}
@ -90,25 +103,19 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 					return "", nil, err
 				}

-				imgData := llm.ImageData{
+				imgData = llm.ImageData{
+					ID:            len(images),
 					Data:          buf.Bytes(),
 					AspectRatioID: aspectRatioID,
 				}
-
-				msgs[i].Content = strings.TrimSpace("<|image|>" + msgs[i].Content)
-				images = append(images, imgData)
-				break
-			}
-		}
+				imgPrompt = "<|image|>"
 			} else {
-		for cnt, msg := range msgs[currMsgIdx:] {
-			prefix := ""
-			prompt := msg.Content
-			for _, i := range msg.Images {
-				imgData := llm.ImageData{
+				imgData = llm.ImageData{
 					ID:   len(images),
 					Data: i,
 				}
+				imgPrompt = " "
+			}

 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
 			if !strings.Contains(prompt, "[img]") {
@ -119,8 +126,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

 			images = append(images, imgData)
 		}
-			msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + " " + prompt)
-		}
+		msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + imgPrompt + prompt)
 	}

 	// truncate any messages that do not fit into the context window
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@ -249,7 +249,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
 			},
 			expect: expect{
-				prompt:        "<|image|>How many hotdogs are in this image? ",
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},
@ -264,7 +264,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
 			},
 			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},
@ -279,8 +279,8 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
 			},
 			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf2},
+				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf, imgBuf2},
 				aspectRatioID: 1,
 			},
 		},
@ -294,7 +294,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Which ones have mustard?"},
 			},
 			expect: expect{
-				prompt:        "<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},
--- a/server/routes.go
+++ b/server/routes.go
@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				return
 			}

-			images[i] = llm.ImageData{Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
 		} else {
 			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 		}
@ -239,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}

 			for _, i := range images {
+				imgPrompt := ""
 				if isMllama {
-					msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
-				} else {
-					msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+					imgPrompt = "<|image|>"
 				}
+				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 			}

 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
@ -267,7 +267,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}

-	slog.Debug("generate request", "prompt", prompt, "images", images)
+	slog.Debug("generate request", "images", len(images), "prompt", prompt)

 	ch := make(chan any)
 	go func() {
@ -622,7 +622,7 @@ func (s *Server) PushHandler(c *gin.Context) {
 }

 func checkNameExists(name model.Name) error {
-	names, err := Manifests()
+	names, err := Manifests(true)
 	if err != nil {
 		return err
 	}
@ -894,7 +894,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
 }

 func (s *Server) ListHandler(c *gin.Context) {
-	ms, err := Manifests()
+	ms, err := Manifests(true)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@ -1211,6 +1211,9 @@ func Serve(ln net.Listener) error {
 	}

 	if !envconfig.NoPrune() {
+		if _, err := Manifests(false); err != nil {
+			slog.Warn("corrupt manifests detected, skipping prune operation.  Re-pull or delete to clear", "error", err)
+		} else {
 			// clean up unused layers and manifests
 			if err := PruneLayers(); err != nil {
 				return err
@ -1225,6 +1228,7 @@ func Serve(ln net.Listener) error {
 				return err
 			}
 		}
+	}

 	ctx, done := context.WithCancel(context.Background())
 	schedCtx, schedDone := context.WithCancel(ctx)
--- a/server/sched.go
+++ b/server/sched.go
@ -130,11 +130,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): multimodal models don't support parallel yet
+			// TODO (jmorganca): mllama doesn't support parallel yet
 			// see https://github.com/ollama/ollama/issues/4165
-			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("multimodal models don't support parallel requests yet")
+				slog.Warn("mllama doesn't support parallel requests yet")
 			}

 			for {
				`@ -0,0 +1 @@`
				`LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555`
				`@ -1 +0,0 @@`
				`Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555`