Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
This commit is contained in: commit aa3a7dea34
85 changed files with 3458 additions and 15884 deletions

@@ -3,9 +3,7 @@ ollama
 app
 macapp
 dist
-llm/llama.cpp
 .env
 .cache
 test_data
-llm/build
 llama/build

.gitattributes (vendored), 1 line changed

@@ -1,4 +1,3 @@
-llm/ext_server/* linguist-vendored
 llama/**/*.cpp linguist-vendored
 llama/**/*.hpp linguist-vendored
 llama/**/*.h linguist-vendored
.github/workflows/release.yaml (vendored), 313 lines changed

@@ -1,5 +1,9 @@
 name: release
 
+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+
 on:
   push:
     tags:

@@ -8,7 +12,7 @@ on:
 jobs:
   # Full build of the Mac assets
   build-darwin:
-    runs-on: macos-12
+    runs-on: macos-13
     environment: release
     steps:
       - uses: actions/checkout@v4

@@ -39,8 +43,8 @@ jobs:
          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
          APPLE_TEAM_ID: ${{ vars.APPLE_TEAM_ID }}
          APPLE_ID: ${{ vars.APPLE_ID }}
-          SDKROOT: /Applications/Xcode_13.4.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-          DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
+          SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+          DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer
        run: |
          ./scripts/build_darwin.sh
 

@@ -48,8 +52,8 @@ jobs:
        with:
          name: dist-darwin
          path: |
-            dist/*arwin*
-            !dist/*-cov
+            dist/Ollama-darwin.zip
+            dist/ollama-darwin
 
   # Windows builds take a long time to both install the dependencies and build, so parallelize
   # CPU generation step

@@ -60,51 +64,34 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - run: go get ./...
      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          go generate -x ./...
-        name: go generate
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make
+        name: make
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cpu
          path: |
            build/**/*
-            build/**/*.a
-            llm/build/**/*.a
            dist/windows-amd64/**
 
   # ROCm generation step

@@ -115,74 +102,55 @@ jobs:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Add msys paths
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install ROCm'
+      # ROCM installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-      - name: 'gather rocm dependencies'
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: make rocm runner
        run: |
-          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          md "dist\deps\bin\rocblas\library"
-          cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make rocm
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-rocm
          path: |
            build/**/*
            dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-rocm-deps
-          path: dist/deps/*
 
   # CUDA generation step
   generate-windows-cuda:

@@ -191,88 +159,80 @@ jobs:
    strategy:
      matrix:
        cuda:
-          - version: "11"
-            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
-          - version: "12"
-            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
+          - version: "11.3"
+            url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+          - version: "12.4"
+            url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
      - uses: actions/checkout@v4
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
      - name: Set Version
        shell: bash
        run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - uses: 'google-github-actions/auth@v2'
-        with:
-          project_id: 'ollama'
-          credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}'
-      - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt
-      - name: install Windows SDK 8.1 to get signtool
+      - name: Install msys2
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading SDK"
-          Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe"
-          Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait
-          write-host "Win SDK 8.1 installed"
-          gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe'
-      - name: install signing plugin
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "downloading plugin"
-          Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip"
-          Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\
-          write-host "Installing plugin"
-          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
-          write-host "plugin installed"
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA ${{ matrix.cuda.version }}'
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ matrix.cuda.url }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: go generate
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: make cuda runner
        run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-      - name: 'gather cuda dependencies'
-        run: |
-          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
-          md "dist\deps"
-          cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
      - uses: actions/upload-artifact@v4
        with:
          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            build/**/*
            dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-cuda-deps-${{ matrix.cuda.version }}
-          path: dist/deps/*
 
 
   # windows arm64 generate, go build, and zip file (no installer)
   # Output of this build is aggregated into the final x86 build

@@ -292,6 +252,30 @@ jobs:
          choco install -y --no-progress git gzip
          echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # pacman is buggy on win arm64, so we avoid using it, but rely on the binary artifacts
+      # we download the sfx (7zip bundle) which isn't fully set up, but the binaries we need to build work
+      - name: Install msys2 x64
+        run: |
+          $url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
+          write-host "Downloading MSYS2"
+          Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
+            '-y', '-oC:\'
+            ) -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # since pacman isn't reliable, we just download the tar file and extract directly
+      - name: Downloading and extracting msys2 make tar file
+        run: |
+          $url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
+          write-host "Downloading make"
+          Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
+          cd c:\msys64; tar -xf make.tar.zst
+          rm c:\msys64\make.tar.zst
+      - name: Verify Make works properly
+        run: |
+          echo $env:PATH
+          make --version
      - name: Install Visual Studio 2022
        run: |
          $components = @(

@@ -385,13 +369,12 @@ jobs:
      - run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
+          import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
+          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
          $env:ARCH="arm64"
-          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
+          .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip
        name: 'Windows Build'
      - uses: actions/upload-artifact@v4
        with:

@@ -441,6 +424,24 @@ jobs:
          write-host "Installing plugin"
          & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
          write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

@@ -451,19 +452,10 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-11
+          name: generate-windows-cuda-11.3
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-11
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-rocm-deps
+          name: generate-windows-cuda-12.4
      - uses: actions/download-artifact@v4
        with:
          name: generate-windows-rocm

@@ -473,12 +465,11 @@ jobs:
          path: dist
      - run: dir build
      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:OLLAMA_SKIP_GENERATE="1"
+          $env:ARCH="amd64"
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
          & .\scripts\build_windows.ps1
      - uses: actions/upload-artifact@v4
        with:
.github/workflows/test.yaml (vendored), 224 lines changed

@@ -1,5 +1,11 @@
 name: test
 
+env:
+  ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe
+  MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe
+  CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe
+  CUDA_12_WINDOWS_VER: 12.4
+
 concurrency:
   # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR
   # cancels running CI jobs and starts all new ones.

@@ -21,9 +27,6 @@ jobs:
   changes:
    runs-on: ubuntu-latest
    outputs:
-      GENERATE: ${{ steps.changes.outputs.GENERATE }}
-      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
-      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
      RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
    steps:
      - uses: actions/checkout@v4

@@ -39,53 +42,12 @@ jobs:
          }
 
          {
-            echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
            echo RUNNERS=$(changed 'llama/**')
          } >>$GITHUB_OUTPUT
 
-  generate:
+  runners-linux-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE == 'True' }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
-        arch: [amd64, arm64]
-        exclude:
-          - os: ubuntu-latest
-            arch: arm64
-          - os: windows-2019
-            arch: arm64
-    runs-on: ${{ matrix.os }}
-    env:
-      GOARCH: ${{ matrix.arch }}
-      CGO_ENABLED: '1'
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-          cache: true
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH"
-          echo $env:PATH
-          go generate -x ./...
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        name: 'Windows Go Generate'
-      - run: go generate -x ./...
-        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        name: 'Unix Go Generate'
-      - run: go build .
-  generate-cuda:
-    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        cuda-version:

@@ -95,8 +57,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4

@@ -107,12 +67,11 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
-  generate-rocm:
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores cuda_v11
+  runners-linux-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:
      matrix:
        rocm-version:

@@ -122,8 +81,6 @@ jobs:
    steps:
      - run: |
          apt-get update && apt-get install -y git build-essential curl rocm-libs
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
        env:
          DEBIAN_FRONTEND: noninteractive
      - uses: actions/checkout@v4

@@ -134,14 +91,13 @@ jobs:
      - run: go get ./...
      - run: |
          git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores rocm
 
   # ROCm generation step
-  generate-windows-rocm:
+  runners-windows-rocm:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4

@@ -149,35 +105,50 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install ROCm'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # ROCM installation steps
+      - name: 'Cache ROCm installer'
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: rocm-install.exe
+          key: ${{ env.ROCM_WINDOWS_URL }}
+      - name: 'Conditionally Download ROCm'
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading AMD HIP Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP"
+          Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe"
+      - name: 'Install ROCm'
+        run: |
+          Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
      - name: 'Verify ROCm'
        run: |
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+      - name: make rocm runner
+        run: |
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make rocm
 
   # CUDA generation step
-  generate-windows-cuda:
+  runners-windows-cuda:
    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    runs-on: windows
    steps:
      - uses: actions/checkout@v4

@@ -185,37 +156,51 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA'
+      - name: Set make jobs default
+        run: |
+          echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # CUDA installation steps
+      - name: 'Cache CUDA installer'
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: cuda-install.exe
+          key: ${{ env.CUDA_12_WINDOWS_URL }}
+      - name: 'Conditionally Download CUDA'
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
-          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
-          write-host "Installing CUDA"
-          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
-          write-host "Completed CUDA"
+          Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe"
+      - name: 'Install CUDA'
+        run: |
+          $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"}
+          Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait
+      - name: 'Verify CUDA'
+        run: |
+          & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version
          $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path)
          $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2'
-          echo "$cudaPath\bin" >> $env:GITHUB_PATH
-          echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
-      - name: 'Verify CUDA'
-        run: nvcc -V
-      - run: go get ./...
-      - name: go generate
-        run: |
-          $gopath=(get-command go).source | split-path -parent
-          $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$cudabin;$env:PATH"
-          $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+      - name: Add msys paths
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
+      - name: make cuda runner
+        run: |
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
 
-  runners:
+  runners-cpu:
    needs: [changes]
    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
    strategy:

@@ -238,21 +223,30 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - run: go get ./...
+      - name: Add msys paths
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - name: 'Build Windows Go Runners'
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          $gopath=(get-command go).source | split-path -parent
          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
          $env:PATH="$gopath;$gccpath;$env:PATH"
          echo $env:PATH
-          make -C llama -j 4
+          if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
+          make -j 4
      - name: 'Build Unix Go Runners'
        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        run: make -C llama -j 4
+        run: make -j 4
      - run: go build .
 
   lint:

@@ -302,9 +296,6 @@ jobs:
    env:
      GOARCH: ${{ matrix.arch }}
      CGO_ENABLED: '1'
-      OLLAMA_CPU_TARGET: 'static'
-      OLLAMA_SKIP_CPU_GENERATE: '1'
-      OLLAMA_SKIP_METAL_GENERATE: '1'
    steps:
      - uses: actions/checkout@v4
        with:

@@ -319,7 +310,6 @@ jobs:
          arm64) echo ARCH=arm64 ;;
          esac >>$GITHUB_ENV
        shell: bash
-      - run: go generate ./...
      - run: go build
      - run: go test -v ./...
 

@@ -333,4 +323,4 @@ jobs:
          submodules: recursive
      - name: Verify patches carry all the changes
        run: |
-          cd llama && make apply-patches sync && git diff --compact-summary --exit-code .
+          make apply-patches sync && git diff --compact-summary --exit-code llama
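
Side note on the Linux runner jobs above: they size make's parallelism from the CPU core count (`grep '^core id' /proc/cpuinfo | sort -u | wc -l`, then `make -j $cores`). For reference, a minimal Go sketch of the same idea; note that `runtime.NumCPU` reports logical CPUs, while the grep pipeline counts physical cores, so the two can differ on SMT machines.

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	// runtime.NumCPU reports the number of logical CPUs usable by this process,
	// which is a reasonable default for make's -j flag.
	fmt.Printf("make -j %d\n", runtime.NumCPU())
}
```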

.gitmodules (vendored), 4 lines changed

@@ -1,4 +0,0 @@
-[submodule "llama.cpp"]
-	path = llm/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
-	shallow = true

Makefile (new file), 4 lines

@@ -0,0 +1,4 @@
+GOALS := $(or $(MAKECMDGOALS),all)
+.PHONY: $(GOALS)
+$(GOALS):
+	$(MAKE) -C llama $@
@@ -12,7 +12,7 @@ Get up and running with large language models.
 
 [Download](https://ollama.com/download/Ollama-darwin.zip)
 
-### Windows preview
+### Windows
 
 [Download](https://ollama.com/download/OllamaSetup.exe)
 

@@ -331,6 +331,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
+- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
+- [Reddit Rate]((https://github.com/rapidarchitect/reddit_analyzer)) (Search and Rate Reddit topics with a weighted summation)
 
 ### Terminal
 

@@ -454,6 +456,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
 - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
 - [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
+- [TextCraft](https://github.com/suncloudsmoon/TextCraft) (Copilot in Word alternative using Ollama)
 
 ### Supported backends
 
@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`

@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 		NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 		NumThread: 0,  // let the runtime decide
 		LowVRAM:   false,
-		F16KV:     true,
 		UseMLock:  false,
 		UseMMap:   nil,
 	},
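
On the `F16KV` change above: the field stays in the struct so that JSON from older clients still decodes, and the `// Deprecated:` comment is the standard Go marker that linters and editors surface to callers. A minimal sketch of that pattern with a hypothetical struct, not the project's real options type:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Runner is a hypothetical stand-in that keeps a deprecated field
// for backwards compatibility with older request payloads.
type Runner struct {
	NumGPU int  `json:"num_gpu,omitempty"`
	F16KV  bool `json:"f16_kv,omitempty"` // Deprecated: ignored by the server
}

func main() {
	// Old clients can still send f16_kv; it decodes cleanly and is simply ignored.
	var r Runner
	_ = json.Unmarshal([]byte(`{"num_gpu":1,"f16_kv":true}`), &r)
	fmt.Printf("%+v\n", r) // {NumGPU:1 F16KV:true}
}
```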
@@ -11,10 +11,12 @@ import (
 
 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tray"
+	"github.com/ollama/ollama/envconfig"
 )
 
 func Run() {
 	InitLogging()
+	slog.Info("app config", "env", envconfig.Values())
 
 	ctx, cancel := context.WithCancel(context.Background())
 	var done chan int
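
The lifecycle change above logs the resolved configuration once at startup via `slog.Info("app config", "env", envconfig.Values())`. A standalone sketch of that structured-logging pattern using only the standard library; the `config` map here is an assumed stand-in for whatever the project's `envconfig.Values()` returns:

```go
package main

import "log/slog"

func main() {
	// Stand-in for envconfig.Values(); the real call returns the resolved settings.
	config := map[string]string{
		"OLLAMA_HOST":   "127.0.0.1:11434",
		"OLLAMA_MODELS": "/tmp/models",
	}

	// One structured record at startup makes later log lines easier to interpret.
	slog.Info("app config", "env", config)
}
```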
@@ -36,8 +36,13 @@ func init() {
 	ServerLogFile = filepath.Join(AppDataDir, "server.log")
 	UpgradeLogFile = filepath.Join(AppDataDir, "upgrade.log")
 
-	// Executables are stored in APPDATA
+	exe, err := os.Executable()
+	if err != nil {
+		slog.Warn("error discovering executable directory", "error", err)
 		AppDir = filepath.Join(localAppData, "Programs", "Ollama")
+	} else {
+		AppDir = filepath.Dir(exe)
+	}
 
 	// Make sure we have PATH set correctly for any spawned children
 	paths := strings.Split(os.Getenv("PATH"), ";")

@@ -64,7 +69,7 @@ func init() {
 	}
 
 	// Make sure our logging dir exists
-	_, err := os.Stat(AppDataDir)
+	_, err = os.Stat(AppDataDir)
 	if errors.Is(err, os.ErrNotExist) {
 		if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
 			slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
@@ -18,11 +18,17 @@ func getCLIFullPath(command string) string {
 	var cmdPath string
 	appExe, err := os.Executable()
 	if err == nil {
+		// Check both the same location as the tray app, as well as ./bin
 		cmdPath = filepath.Join(filepath.Dir(appExe), command)
 		_, err := os.Stat(cmdPath)
 		if err == nil {
 			return cmdPath
 		}
+		cmdPath = filepath.Join(filepath.Dir(appExe), "bin", command)
+		_, err = os.Stat(cmdPath)
+		if err == nil {
+			return cmdPath
+		}
 	}
 	cmdPath, err = exec.LookPath(command)
 	if err == nil {
@@ -26,19 +26,15 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 	slog.Info("starting upgrade with " + installerExe)
 	slog.Info("upgrade log file " + UpgradeLogFile)
 
-	// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
+	// make the upgrade show progress, but non interactive
 	installArgs := []string{
 		"/CLOSEAPPLICATIONS",                    // Quit the tray app if it's still running
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
-	}
-	// make the upgrade as quiet as possible (no GUI, no prompts)
-	installArgs = append(installArgs,
 		"/SP", // Skip the "This will install... Do you wish to continue" prompt
-		"/SUPPRESSMSGBOXES",
+		"/NOCANCEL", // Disable the ability to cancel upgrade mid-flight to avoid partially installed upgrades
 		"/SILENT",
-		"/VERYSILENT",
-	)
+	}
 
 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
@@ -53,8 +53,8 @@ RestartIfNeededByRun=no
 ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile
 WizardSmallImageFile=.\assets\setup.bmp
 
-; TODO verifty actual min windows version...
-; OG Win 10
+; Ollama requires Windows 10 22H2 or newer for proper unicode rendering
+; TODO: consider setting this to 10.0.19045
 MinVersion=10.0.10240
 
 ; First release that supports WinRT UI Composition for win32 apps

@@ -136,7 +136,7 @@ Type: filesandordirs; Name: "{%TEMP}\ollama*"
 Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama"
 
 [Messages]
-WizardReady=Ollama Windows Preview
+WizardReady=Ollama
 ReadyLabel1=%nLet's get you up and running with your own large language models.
 SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or finish the other installer, then click OK to continue with this install, or Cancel to exit.
 
@@ -11,12 +11,13 @@ import (
 )

 const (
-	updateAvailableMenuID = 1
-	updateMenuID          = updateAvailableMenuID + 1
-	separatorMenuID       = updateMenuID + 1
-	diagLogsMenuID        = separatorMenuID + 1
-	diagSeparatorMenuID   = diagLogsMenuID + 1
-	quitMenuID            = diagSeparatorMenuID + 1
+	_ = iota
+	updateAvailableMenuID
+	updateMenuID
+	separatorMenuID
+	diagLogsMenuID
+	diagSeparatorMenuID
+	quitMenuID
 )

 func (t *winTray) initMenus() error {
@@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
 			// query the management library as well so we can record any skew between the two
 			// which represents overhead on the GPU we must set aside on subsequent updates
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpuInfo.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 				if memInfo.err != nil {
 					slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 					C.free(unsafe.Pointer(memInfo.err))
@@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpu.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
@@ -4,6 +4,7 @@
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
   CUresult ret;
   resp->err = NULL;
   resp->num_devices = 0;
@@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
       resp->cudaErr = -1;
       return;
     }
+    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
   }

+  LOG(resp->ch.verbose, "calling cuInit\n");
   ret = (*resp->ch.cuInit)(0);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
   resp->ch.driver_minor = 0;

   // Report driver version if we're in verbose mode, ignore errors
+  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
   ret = (*resp->ch.cuDriverGetVersion)(&version);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
   } else {
+    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
     resp->ch.driver_major = version / 1000;
     resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
     LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
   }

+  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
   ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
   if (ret != CUDA_SUCCESS) {
     LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
     resp->cudaErr = ret;
     return;
   }
+  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
 }

   const int buflen = 256;
@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   } l[] = {
       {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
       {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
       {NULL, NULL},
   };
@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }


-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
   nvmlReturn_t ret;
-  ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+  ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
   if (ret != NVML_SUCCESS) {
-    LOG(1, "unable to get device handle %d: %d", device_id, ret);
+    LOG(1, "unable to get device handle %s: %d", uuid, ret);
     *free = 0;
     return;
   }

   ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
   if (ret != NVML_SUCCESS) {
-    LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+    LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
     *free = 0;
     return;
   }
@@ -25,7 +25,7 @@ typedef struct nvml_handle {
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
   nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
   nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;

@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;

 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);

 #endif // __GPU_INFO_NVML_H__
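The NVML hunks above switch from index-based device lookup (`nvmlDeviceGetHandleByIndex`) to UUID-based lookup (`nvmlDeviceGetHandleByUUID`), so the free-memory query is tied to a specific board rather than to an enumeration order that can differ between CUDA and NVML. Ollama performs this through its own C shim shown in the diff; purely as an illustration of the same NVML call sequence, here is a hedged sketch using the separate go-nvml bindings — an assumption for this example, not a dependency introduced by the commit.

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// freeMemoryByUUID queries free/total memory for the GPU with the given UUID,
// mirroring the role of nvml_get_free(h, uuid, ...) in the diff above.
func freeMemoryByUUID(uuid string) (free, total uint64, err error) {
	dev, ret := nvml.DeviceGetHandleByUUID(uuid)
	if ret != nvml.SUCCESS {
		return 0, 0, fmt.Errorf("unable to get device handle %s: %v", uuid, ret)
	}
	mem, ret := dev.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return 0, 0, fmt.Errorf("device memory info lookup failure %s: %v", uuid, ret)
	}
	return mem.Free, mem.Total, nil
}

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("nvml init failed: %v", ret)
	}
	defer nvml.Shutdown()

	// UUID format is the one NVML reports, e.g. "GPU-xxxxxxxx-..."; this value is a placeholder.
	free, total, err := freeMemoryByUUID("GPU-00000000-0000-0000-0000-000000000000")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("free=%d total=%d\n", free, total)
}
```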
@@ -3,9 +3,11 @@ package discover
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"os"
 	"reflect"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -109,6 +111,10 @@ func GetCPUDetails() ([]CPU, error) {
 	if err != nil {
 		return nil, err
 	}
+	return linuxCPUDetails(file)
+}

+func linuxCPUDetails(file io.Reader) ([]CPU, error) {
 	reColumns := regexp.MustCompile("\t+: ")
 	scanner := bufio.NewScanner(file)
 	cpuInfos := []linuxCpuInfo{}
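Threading an `io.Reader` through the new `linuxCPUDetails` helper (instead of reading `/proc/cpuinfo` directly inside `GetCPUDetails`) is what makes the new `discover/gpu_linux_test.go` feasible: the parser can be fed canned cpuinfo text. A hedged sketch of that style of test follows; the cpuinfo keys and the socket-count assertion are assumptions based on the diff, not a copy of the suppressed test file.

```go
package discover

import (
	"strings"
	"testing"
)

func TestLinuxCPUDetailsSketch(t *testing.T) {
	// Two hyperthreads on one core of one physical socket, in /proc/cpuinfo's "key\t: value" form.
	cpuinfo := strings.Join([]string{
		"processor\t: 0",
		"physical id\t: 0",
		"core id\t: 0",
		"cpu cores\t: 1",
		"",
		"processor\t: 1",
		"physical id\t: 0",
		"core id\t: 0",
		"cpu cores\t: 1",
		"",
	}, "\n")

	cpus, err := linuxCPUDetails(strings.NewReader(cpuinfo))
	if err != nil {
		t.Fatal(err)
	}
	if len(cpus) != 1 {
		t.Fatalf("expected 1 socket, got %d", len(cpus))
	}
}
```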
@@ -131,6 +137,9 @@ func GetCPUDetails() ([]CPU, error) {
 			cpu = &linuxCpuInfo{}
 		}
 	}
+	if cpu.ID != "" {
+		cpuInfos = append(cpuInfos, *cpu)
+	}

 	// Process the sockets/cores/threads
 	socketByID := map[string]*CPU{}
@@ -177,10 +186,14 @@ func GetCPUDetails() ([]CPU, error) {
 			s.EfficiencyCoreCount = efficiencyCoreCount
 		}
 	}
-	result := []CPU{}
-	for _, c := range socketByID {
-		result = append(result, *c)
+	keys := make([]string, 0, len(socketByID))
+	result := make([]CPU, 0, len(socketByID))
+	for k := range socketByID {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		result = append(result, *socketByID[k])
 	}
 	return result, nil
 }
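The change above exists because Go map iteration order is randomized: collecting the keys, sorting them, and only then appending gives `GetCPUDetails` a stable socket ordering across runs. The same pattern in isolation:

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	socketByID := map[string]int{"2": 8, "0": 16, "1": 12}

	// Ranging over the map directly would yield a different order on every run.
	keys := make([]string, 0, len(socketByID))
	for k := range socketByID {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	for _, k := range keys {
		fmt.Println(k, socketByID[k]) // deterministic: 0, 1, 2
	}
}
```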
discover/gpu_linux_test.go (new file, 2097 lines) — diff suppressed because it is too large
@@ -175,6 +175,11 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
 		return 0
 	}
-	// Allocate thread count matching the performance cores on a single socket
-	return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
+	coreCount := 0
+	for _, c := range si.System.CPUs {
+		coreCount += c.CoreCount - c.EfficiencyCoreCount
+	}
+
+	return coreCount
 }
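The new `GetOptimalThreadCount` sums performance cores across every socket instead of looking only at the first one. A standalone sketch of the same calculation, with the `CPU` struct reduced to the two fields the diff relies on:

```go
package main

import "fmt"

type CPU struct {
	CoreCount           int
	EfficiencyCoreCount int
}

// optimalThreadCount mirrors the updated logic: performance cores on all sockets.
func optimalThreadCount(cpus []CPU) int {
	if len(cpus) == 0 {
		return 0
	}
	coreCount := 0
	for _, c := range cpus {
		coreCount += c.CoreCount - c.EfficiencyCoreCount
	}
	return coreCount
}

func main() {
	cpus := []CPU{
		{CoreCount: 16, EfficiencyCoreCount: 8},
		{CoreCount: 16, EfficiencyCoreCount: 8},
	}
	fmt.Println(optimalThreadCount(cpus)) // 16: eight performance cores per socket, two sockets
}
```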
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,
@@ -1,183 +1,5 @@
 # Development

-> [!IMPORTANT]
-> The `llm` package that loads and runs models is being updated to use a new [Go runner](#transition-to-go-runner): this should only impact a small set of PRs however it does change how the project is built.
-
-Install required tools:
-
-- cmake version 3.24 or higher
-- go version 1.22 or higher
-- gcc version 11.4.0 or higher
-
-### MacOS
-
-```bash
-brew install go cmake gcc
-```
-
-Optionally enable debugging and more verbose logging:
-
-```bash
-# At build time
-export CGO_CFLAGS="-g"
-
-# At runtime
-export OLLAMA_DEBUG=1
-```
-
-Get the required libraries and build the native LLM code:
-
-```bash
-go generate ./...
-```
-
-Then build ollama:
-
-```bash
-go build .
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-#### Linux ROCm (AMD)
-
-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Advanced CPU Settings
-
-By default, running `go generate ./...` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. If you would like to build a CPU-based build customized for your
-processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
-like to use. For example, to compile an optimized binary for an Intel i9-9880H,
-you might use:
-
-```
-OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
-go build .
-```
-
-#### Containerized Linux Build
-
-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
-
-### Windows
-
-Note: The Windows build for Ollama is still under development.
-
-First, install required tools:
-
-- MSVC toolchain - C/C++ and cmake as minimal requirements
-- Go version 1.22 or higher
-- MinGW (pick one variant) with GCC.
-  - [MinGW-w64](https://www.mingw-w64.org/)
-  - [MSYS2](https://www.msys2.org/)
-- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
-
-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-go generate ./...
-go build .
-```
-
-#### Windows CUDA (NVIDIA)
-
-In addition to the common Windows development tools described above, install CUDA after installing MSVC.
-
-- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
-
-
-#### Windows ROCm (AMD Radeon)
-
-In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
-
-- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
-- [Strawberry Perl](https://strawberryperl.com/)
-
-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
-
-#### Windows arm64
-
-The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
-
-```powershell
-import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
-Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
-```
-
-You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
-
-Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
-
-```
-pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
-```
-
-You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-
-## Transition to Go runner
-
-The Ollama team is working on moving to a new Go based runner that loads and runs models in a subprocess to replace the previous code under `ext_server`. During this transition period, this new Go runner is "opt in" at build time, and requires using a different approach to build.
-
-After the transition to use the Go server exclusively, both `make` and `go generate` will build the Go runner.
-
 Install required tools:

 - go version 1.22 or higher
@@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1
 Get the required libraries and build the native LLM code: (Adjust the job count based on your number of processors for a faster build)

 ```bash
-make -C llama -j 5
+make -j 5
 ```

 Then build ollama:
@@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "
 Then generate dependencies: (Adjust the job count based on your number of processors for a faster build)

 ```
-make -C llama -j 5
+make -j 5
 ```

 Then build the binary:
@@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx
 Then generate dependencies: (Adjust the job count based on your number of processors for a faster build)

 ```
-make -C llama -j 5
+make -j 5
 ```

 Then build the binary:
@@ -286,7 +108,7 @@ Custom CPU settings are not currently supported in the new Go server build but w

 #### Containerized Linux Build

-If you have Docker available, you can build linux binaries with `OLLAMA_NEW_RUNNERS=1 ./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`

 ### Windows

@@ -296,16 +118,19 @@ The following tools are required as a minimal development environment to build C
   - https://go.dev/dl/
 - Git
   - https://git-scm.com/download/win
-- GCC and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
+- clang with gcc compat and Make. There are multiple options on how to go about installing these tools on Windows. We have verified the following, but others may work as well:
   - [MSYS2](https://www.msys2.org/)
-    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-ucrt-x86_64-gcc make` to install the required tools
-  - Assuming you used the default install prefix for msys2 above, add `c:\msys64\ucrt64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
+  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)

+> [!NOTE]
+> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.

 Then, build the `ollama` binary:

 ```powershell
 $env:CGO_ENABLED="1"
-make -C llama -j 8
+make -j 8
 go build .
 ```

@@ -10,7 +10,7 @@ This sounds like a typical censored response, but even llama2-uncensored gives a

 So let's figure out how we can use **LangChain** with Ollama to ask our question to the actual document, the Odyssey by Homer, using Python.

-Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
+Let's start by asking a simple question that we can get an answer to from the **Llama3** model using **Ollama**. First, we need to install the **LangChain** package:

 `pip install langchain_community`

@@ -1,22 +1,15 @@
-# Ollama Windows Preview
+# Ollama Windows

-Welcome to the Ollama Windows preview.
+Welcome to Ollama for Windows.

 No more WSL required!

 Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
+After installing Ollama for Windows, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
 `http://localhost:11434`.

-As this is a preview release, you should expect a few bugs here and there. If
-you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an
-[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
-
 ## System Requirements

 * Windows 10 22H2 or newer, Home or Pro
@@ -25,6 +18,32 @@ Logs will often be helpful in diagnosing the problem (see

 Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.

+## Filesystem Requirements
+
+The Ollama install does not require Administrator, and installs in your home directory by default. You'll need at least 4GB of space for the binary install. Once you've installed Ollama, you'll need additional space for storing the Large Language models, which can be tens to hundreds of GB in size. If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+
+### Changing Install Location
+
+To install the Ollama application in a location different than your home directory, start the installer with the following flag
+
+```powershell
+OllamaSetup.exe /DIR="d:\some\location"
+```
+
+### Changing Model Location
+
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+
+2. Click on _Edit environment variables for your account_.
+
+3. Edit or create a new variable for your user account for `OLLAMA_MODELS` where you want the models stored
+
+4. Click OK/Apply to save.
+
+If Ollama is already running, Quit the tray application and relaunch it from the Start menu, or a new terminal started after you saved the environment variables.
+
 ## API Access

 Here's a quick example showing API access from `powershell`
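For tooling that needs to locate the models on disk, the lookup order the new docs describe is: use `OLLAMA_MODELS` when it is set, otherwise fall back to a default under the user's home directory. A hedged Go sketch of that resolution; the default subdirectory name used here is an assumption for illustration only.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// modelsDir returns OLLAMA_MODELS if set, else an assumed conventional
// default under the user's home directory (".ollama/models" here).
func modelsDir() (string, error) {
	if dir := os.Getenv("OLLAMA_MODELS"); dir != "" {
		return dir, nil
	}
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	return filepath.Join(home, ".ollama", "models"), nil
}

func main() {
	dir, err := modelsDir()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(dir)
}
```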
@@ -34,10 +53,6 @@ Here's a quick example showing API access from `powershell`

 ## Troubleshooting

-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increases logging for the GUI app and
-server.
-
 Ollama on Windows stores files in a few different locations. You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
@@ -52,6 +67,10 @@ the explorer window by hitting `<cmd>+R` and type in:

 The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.

+> [!NOTE]
+> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models
+
+
 ## Standalone CLI

 The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
@@ -31,7 +31,7 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 }

 func TestUnicode(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
@@ -42,9 +42,15 @@ func TestUnicode(t *testing.T) {
 		Options: map[string]interface{}{
 			"temperature": 0,
 			"seed":        123,
+			// Workaround deepseek context shifting bug
+			"num_ctx":     8192,
+			"num_predict": 2048,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"散射", "频率"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
 }

 func TestExtendedUnicodeOutput(t *testing.T) {
@@ -60,7 +66,10 @@ func TestExtendedUnicodeOutput(t *testing.T) {
 			"seed":        123,
 		},
 	}
-	GenerateTestHelper(ctx, t, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
 }

 func TestUnicodeModelDir(t *testing.T) {
@@ -60,7 +60,8 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
+			// Note: CPU based inference can crawl so don't give up too quickly
+			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
 		}(i)
 	}
 	wg.Wait()
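The loop above fans each request out into its own goroutine and relaxes the per-response deadlines because CPU-only inference can be slow. The surrounding concurrency pattern, reduced to a self-contained skeleton with the generate call stubbed out:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	reqs := []string{"req-0", "req-1", "req-2"}

	var wg sync.WaitGroup
	wg.Add(len(reqs))
	for i := 0; i < len(reqs); i++ {
		go func(i int) {
			defer wg.Done()
			// Placeholder for DoGenerate(...); the real test now allows 90s
			// for the first token and 30s between tokens on slow CPU inference.
			time.Sleep(10 * time.Millisecond)
			fmt.Println("done:", reqs[i])
		}(i)
	}
	wg.Wait()
}
```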
@@ -12,7 +12,7 @@ import (
 	"github.com/stretchr/testify/require"
 )

-func TestIntegrationMultimodal(t *testing.T) {
+func TestIntegrationLlava(t *testing.T) {
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
@@ -39,6 +39,33 @@ func TestIntegrationMultimodal(t *testing.T) {
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }

+func TestIntegrationMllama(t *testing.T) {
+	image, err := base64.StdEncoding.DecodeString(imageEncoding)
+	require.NoError(t, err)
+	req := api.GenerateRequest{
+		// TODO fix up once we publish the final image
+		Model:  "x/llama3.2-vision",
+		Prompt: "what does the text in this image say?",
+		Stream: &stream,
+		Options: map[string]interface{}{
+			"seed":        42,
+			"temperature": 0.0,
+		},
+		Images: []api.ImageData{
+			image,
+		},
+	}
+
+	resp := "the ollamas"
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	// mllama models on CPU can be quite slow to start,
+	DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
+}
+
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
 AAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAABIAAAAAQAAAEgAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAANKgAwAEAAAAAQAA
 AHgAAAAAXdsepgAAAAlwSFlzAAALEwAACxMBAJqcGAAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6
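The new `TestIntegrationMllama` drives the server through the public Go client in the `api` package. Outside the test harness, an equivalent request looks roughly like the sketch below; the model name is the placeholder from the diff, the image bytes are omitted, and this is an illustration rather than a copy of the test.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST
	if err != nil {
		log.Fatal(err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	stream := false
	req := &api.GenerateRequest{
		Model:  "x/llama3.2-vision", // placeholder name used by the test above
		Prompt: "what does the text in this image say?",
		Stream: &stream,
		Options: map[string]interface{}{
			"seed":        42,
			"temperature": 0.0,
		},
		// Images: []api.ImageData{imageBytes}, // attach decoded image bytes here
	}

	err = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```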
llama/Dockerfile (deleted, 221 lines)
@@ -1,221 +0,0 @@
-# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
-ARG GOLANG_VERSION=1.22.8
-ARG CMAKE_VERSION=3.22.1
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-ARG ROCM_VERSION=6.1.2
-
-### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-#
-# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
-# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-#
-### Then incremental builds will be much faster in this container
-#
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
-#
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-# TODO intel oneapi goes here...
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-# Note: this does not contain jetson variants
-#
-# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
-# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-#
-FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-    dnf config-manager --set-enabled appstream && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
-    else \
-        make -C llama -j 5 ; \
-    fi
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
-
-
-# Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH amd64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/amd64 builder-amd64 AS build-amd64
-COPY . .
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-ARG OLLAMA_SKIP_ROCM_GENERATE
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
-
-FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH arm64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/arm64 builder-arm64 AS build-arm64
-COPY . .
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-
-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH AS dist
-
-
-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM runners-amd64 AS runners-cuda-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
-    ./dist/linux-amd64/lib/ollama/runners/rocm*
-
-FROM runners-amd64 AS runners-rocm-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
-    ./dist/linux-amd64/lib/ollama/libcu*.so* \
-    ./dist/linux-amd64/lib/ollama/runners/cuda*
-
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-
-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
-
-FROM runtime-$TARGETARCH
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-ENV NVIDIA_VISIBLE_DEVICES=all
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
@@ -95,31 +95,17 @@ make -j

 Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.

-> [!IMPORTANT]
-> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit. After merging that PR a new manifest file we be utilized
-
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 ### Updating Base Commit

 **Pin to new base commit**

-To update to a newer base commit, select the upstream git tag or commit
+To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring.env`

-> [!IMPORTANT]
-> After merging #7157 a manifest will be used instead of the submodule
-
-```
-cd llm/llama.cpp
-git fetch
-git checkout NEW_BASE_COMMIT
-cd ..
-git add llama.cpp
-```
-
 #### Applying patches

@@ -128,13 +114,13 @@ When updating to a newer base commit, the existing patches may not apply cleanly
 Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.

 ```
-make -C llama create-patches sync
+make create-patches sync
 ```

 Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
@@ -144,14 +130,14 @@ Build and test Ollama, and make any necessary changes to the Go code based on th
 When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:

 ```
-make -C llama apply-patches
+make apply-patches
 ```

 Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:

 ```
-make -C llama sync
-make -C llama -j 8
+make sync
+make -j 8
 go build .
 ```

@@ -161,7 +147,7 @@ go build .
 Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with

 ```
-make -C llama create-patches
+make create-patches
 ```

 > [!IMPORTANT]
llama/llama.cpp (vendored, 93 lines changed)
@@ -2699,7 +2699,7 @@ struct llama_hparams {
         GGML_ABORT("fatal error");
     }

-    bool cross_attention_layer(uint32_t il) const {
+    bool cross_attention_layers(uint32_t il) const {
         return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
     }
 };
@@ -2731,6 +2731,9 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    bool cross_attn = false;

     enum llama_pooling_type pooling_type;

@@ -3542,10 +3545,6 @@ struct llama_context {
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]

-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    float * cross_attn_state = nullptr;
-    bool cross_attn_state_first_pass = true;
     struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
 };

@@ -3782,7 +3781,7 @@ static bool llama_kv_cache_init(

     for (int i = 0; i < (int) n_layer; i++) {
         // for cross attention layers
-        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
             struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
             ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
             ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
@@ -7389,7 +7388,7 @@ static bool llm_load_tensors(

         auto & layer = model.layers[i];

-        if (hparams.cross_attention_layer(i)) {
+        if (hparams.cross_attention_layers(i)) {
             layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
             layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
             layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
@ -9368,11 +9367,10 @@ static struct ggml_tensor * llm_build_inp_cross_attn_state(
|
||||||
const llm_build_cb & cb) {
|
const llm_build_cb & cb) {
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
|
||||||
struct ggml_tensor * inpCAS;
|
struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
|
||||||
lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
|
cb(inpCAS, "inp_cross_attn_state", -1);
|
||||||
cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
|
ggml_set_input(inpCAS);
|
||||||
ggml_set_input(lctx.inp_cross_attn_state);
|
lctx.inp_cross_attn_state = inpCAS;
|
||||||
inpCAS = lctx.inp_cross_attn_state;
|
|
||||||
|
|
||||||
return inpCAS;
|
return inpCAS;
|
||||||
}
|
}
|
||||||
|
@ -10979,8 +10977,8 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "attn_norm", il);
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
if (hparams.cross_attention_layer(il)) {
|
if (hparams.cross_attention_layers(il)) {
|
||||||
if (!lctx.cross_attn_state) {
|
if (!batch.embd && !cparams.cross_attn) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10991,42 +10989,28 @@ struct llm_build_context {
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
// TODO: is this required?
|
|
||||||
Qcur = ggml_cont(ctx0, Qcur);
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur;
|
struct ggml_tensor * Kcur, * Vcur;
|
||||||
if (lctx.cross_attn_state_first_pass) {
|
if (batch.embd) {
|
||||||
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
|
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
// TODO: is this required?
|
|
||||||
Kcur = ggml_cont(ctx0, Kcur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
|
||||||
} else {
|
|
||||||
Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
|
|
||||||
cb(Kcur, "Kcur (view)", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_tensor * Vcur;
|
|
||||||
if (lctx.cross_attn_state_first_pass) {
|
|
||||||
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
@ -11038,6 +11022,9 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
|
||||||
} else {
|
} else {
|
||||||
|
Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
|
||||||
|
cb(Kcur, "Kcur (view)", il);
|
||||||
|
|
||||||
Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
|
Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
|
||||||
cb(Vcur, "Vcur (view)", il);
|
cb(Vcur, "Vcur (view)", il);
|
||||||
}
|
}
|
||||||
|
@ -11045,11 +11032,8 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
||||||
cb(kq, "kq", il);
|
cb(kq, "kq", il);
|
||||||
|
|
||||||
kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
|
|
||||||
cb(kq, "kq_scaled", il);
|
|
||||||
|
|
||||||
// TODO: apply causal masks
|
// TODO: apply causal masks
|
||||||
struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
|
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
||||||
cb(kq_soft_max, "kq_soft_max", il);
|
cb(kq_soft_max, "kq_soft_max", il);
|
||||||
|
|
||||||
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
||||||
|
@ -17197,11 +17181,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (batch.embd) {
|
if (batch.embd) {
|
||||||
|
if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
|
||||||
|
ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||||
|
// zero out inp_embd since it's not used
|
||||||
|
float * inp_embd_data = (float *)lctx.inp_embd->data;
|
||||||
|
for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
|
||||||
|
inp_embd_data[i] = 0.0f;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (batch.pos && lctx.inp_pos) {
|
if (batch.pos && lctx.inp_pos) {
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
@ -17209,14 +17202,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||||
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO (jmorganca): this might copy a lot of data on every request of a
|
|
||||||
// single generation even though it doesn't change, so we should
|
|
||||||
// find a way to not set this more than one time per image
|
|
||||||
if (lctx.inp_cross_attn_state &&
|
|
||||||
lctx.inp_cross_attn_state->buffer) {
|
|
||||||
ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
@ -17789,7 +17774,7 @@ static int llama_decode_internal(
|
||||||
n_outputs = 1;
|
n_outputs = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
lctx.sbatch.from_batch(batch_all, n_embd,
|
lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
|
||||||
/* simple_split */ !kv_self.recurrent,
|
/* simple_split */ !kv_self.recurrent,
|
||||||
/* logits_all */ n_outputs == n_tokens_all);
|
/* logits_all */ n_outputs == n_tokens_all);
|
||||||
|
|
||||||
|
@ -17899,10 +17884,6 @@ static int llama_decode_internal(
|
||||||
|
|
||||||
llama_set_inputs(lctx, ubatch);
|
llama_set_inputs(lctx, ubatch);
|
||||||
|
|
||||||
// TODO: replace with something better to find out if its
|
|
||||||
// our first actual pass
|
|
||||||
lctx.cross_attn_state_first_pass = false;
|
|
||||||
|
|
||||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||||
|
|
||||||
// update the kv ring buffer
|
// update the kv ring buffer
|
||||||
|
@ -18086,7 +18067,7 @@ static int llama_encode_internal(
|
||||||
|
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
|
||||||
lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||||
|
|
||||||
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
|
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
|
||||||
|
|
||||||
|
@ -20194,11 +20175,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
return ctx;
|
return ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
|
|
||||||
ctx->cross_attn_state_first_pass = true;
|
|
||||||
ctx->cross_attn_state = cross_attn_state;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_free(struct llama_context * ctx) {
|
void llama_free(struct llama_context * ctx) {
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
@ -21686,6 +21662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
|
||||||
ctx->cparams.causal_attn = causal_attn;
|
ctx->cparams.causal_attn = causal_attn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
|
||||||
|
ctx->cparams.cross_attn = cross_attention;
|
||||||
|
}
|
||||||
|
|
||||||
struct llama_batch llama_batch_get_one(
|
struct llama_batch llama_batch_get_one(
|
||||||
llama_token * tokens,
|
llama_token * tokens,
|
||||||
int32_t n_tokens,
|
int32_t n_tokens,
|
||||||
|
@ -21695,6 +21675,7 @@ struct llama_batch llama_batch_get_one(
|
||||||
/*n_tokens =*/ n_tokens,
|
/*n_tokens =*/ n_tokens,
|
||||||
/*tokens =*/ tokens,
|
/*tokens =*/ tokens,
|
||||||
/*embd =*/ nullptr,
|
/*embd =*/ nullptr,
|
||||||
|
/*n_embd =*/ 0,
|
||||||
/*pos =*/ nullptr,
|
/*pos =*/ nullptr,
|
||||||
/*n_seq_id =*/ nullptr,
|
/*n_seq_id =*/ nullptr,
|
||||||
/*seq_id =*/ nullptr,
|
/*seq_id =*/ nullptr,
|
||||||
|
@ -21710,6 +21691,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||||
/*n_tokens =*/ 0,
|
/*n_tokens =*/ 0,
|
||||||
/*tokens =*/ nullptr,
|
/*tokens =*/ nullptr,
|
||||||
/*embd =*/ nullptr,
|
/*embd =*/ nullptr,
|
||||||
|
/*n_embd =*/ 0,
|
||||||
/*pos =*/ nullptr,
|
/*pos =*/ nullptr,
|
||||||
/*n_seq_id =*/ nullptr,
|
/*n_seq_id =*/ nullptr,
|
||||||
/*seq_id =*/ nullptr,
|
/*seq_id =*/ nullptr,
|
||||||
|
@ -21721,6 +21703,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||||
|
|
||||||
if (embd) {
|
if (embd) {
|
||||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||||
|
batch.n_embd = embd;
|
||||||
} else {
|
} else {
|
||||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||||
}
|
}
|
||||||
|
|
258
llama/llama.go
258
llama/llama.go
|
@ -1,5 +1,7 @@
|
||||||
package llama
|
package llama
|
||||||
|
|
||||||
|
//go:generate make -j 8
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
|
#cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
|
||||||
#cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
|
#cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
|
||||||
|
@ -66,6 +68,17 @@ package llama
|
||||||
#include "sampling_ext.h"
|
#include "sampling_ext.h"
|
||||||
|
|
||||||
bool llamaProgressCallback(float progress, void *user_data);
|
bool llamaProgressCallback(float progress, void *user_data);
|
||||||
|
|
||||||
|
typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
|
||||||
|
COMPILER inline get_compiler() {
|
||||||
|
#if defined(__clang__)
|
||||||
|
return COMP_CLANG;
|
||||||
|
#elif defined(__GNUC__)
|
||||||
|
return COMP_GCC;
|
||||||
|
#else
|
||||||
|
return UNKNOWN_COMPILER;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
|
@ -75,6 +88,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"runtime"
|
"runtime"
|
||||||
"runtime/cgo"
|
"runtime/cgo"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
@ -86,7 +100,38 @@ func BackendInit() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func PrintSystemInfo() string {
|
func PrintSystemInfo() string {
|
||||||
return C.GoString(C.llama_print_system_info())
|
var compiler string
|
||||||
|
switch C.get_compiler() {
|
||||||
|
case C.COMP_UNKNOWN:
|
||||||
|
compiler = "cgo(unknown_compiler)"
|
||||||
|
case C.COMP_GCC:
|
||||||
|
compiler = "cgo(gcc)"
|
||||||
|
case C.COMP_CLANG:
|
||||||
|
compiler = "cgo(clang)"
|
||||||
|
}
|
||||||
|
return C.GoString(C.llama_print_system_info()) + compiler
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetModelArch(modelPath string) (string, error) {
|
||||||
|
mp := C.CString(modelPath)
|
||||||
|
defer C.free(unsafe.Pointer(mp))
|
||||||
|
|
||||||
|
gguf_ctx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
|
||||||
|
if gguf_ctx == nil {
|
||||||
|
return "", errors.New("unable to load model file")
|
||||||
|
}
|
||||||
|
defer C.gguf_free(gguf_ctx)
|
||||||
|
|
||||||
|
key := C.CString("general.architecture")
|
||||||
|
defer C.free(unsafe.Pointer(key))
|
||||||
|
arch_index := C.gguf_find_key(gguf_ctx, key)
|
||||||
|
if int(arch_index) < 0 {
|
||||||
|
return "", errors.New("unknown model architecture")
|
||||||
|
}
|
||||||
|
|
||||||
|
arch := C.gguf_get_val_str(gguf_ctx, arch_index)
|
||||||
|
|
||||||
|
return C.GoString(arch), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type ContextParams struct {
|
type ContextParams struct {
|
||||||
|
@ -216,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
|
m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
|
||||||
if m.c == (*C.struct_llama_model)(C.NULL) {
|
if m.c == nil {
|
||||||
return nil, fmt.Errorf("unable to load model: %s", modelPath)
|
return nil, fmt.Errorf("unable to load model: %s", modelPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -232,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
|
||||||
c: C.llama_new_context_with_model(model.c, params.c),
|
c: C.llama_new_context_with_model(model.c, params.c),
|
||||||
numThreads: int(params.c.n_threads),
|
numThreads: int(params.c.n_threads),
|
||||||
}
|
}
|
||||||
if c.c == (*C.struct_llama_context)(C.NULL) {
|
if c.c == nil {
|
||||||
return nil, errors.New("unable to create llama context")
|
return nil, errors.New("unable to create llama context")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -256,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
|
||||||
defer C.free(unsafe.Pointer(cLoraPath))
|
defer C.free(unsafe.Pointer(cLoraPath))
|
||||||
|
|
||||||
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
|
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
|
||||||
|
if loraAdapter == nil {
|
||||||
|
return errors.New("unable to load lora")
|
||||||
|
}
|
||||||
|
|
||||||
err := -1
|
err := -1
|
||||||
if loraAdapter != nil {
|
if loraAdapter != nil {
|
||||||
|
@ -271,18 +319,40 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
|
||||||
type Batch struct {
|
type Batch struct {
|
||||||
c C.struct_llama_batch
|
c C.struct_llama_batch
|
||||||
batchSize int
|
batchSize int
|
||||||
|
maxSeq int
|
||||||
embedSize int
|
embedSize int
|
||||||
}
|
}
|
||||||
|
|
||||||
// Creates a new batch for either word tokens if embed is 0 or
|
// Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
|
||||||
// image embeddings if embed is specified. Batches cannot contain
|
// Batches cannot contain both types at the same time. batchSize is the maximum number of entries
|
||||||
// both types at the same time
|
// that can be added per sequence
|
||||||
func NewBatch(nTokens int, embed int, maxSeq int) *Batch {
|
func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
|
||||||
return &Batch{
|
b := Batch{
|
||||||
c: C.llama_batch_init(C.int(nTokens), C.int(embed), C.int(maxSeq)),
|
c: C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
|
||||||
batchSize: nTokens,
|
batchSize: batchSize,
|
||||||
embedSize: embed,
|
maxSeq: maxSeq,
|
||||||
|
embedSize: embedSize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check to see if any of the allocations in llama_batch_init() failed
|
||||||
|
nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
|
||||||
|
b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
|
||||||
|
slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
|
||||||
|
|
||||||
|
if nilPointer {
|
||||||
|
C.llama_batch_free(b.c)
|
||||||
|
return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &b, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) Size() int {
|
||||||
|
return b.batchSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) allocSize() int {
|
||||||
|
return b.batchSize * b.maxSeq
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *Batch) NumTokens() int {
|
func (b *Batch) NumTokens() int {
|
||||||
|
@ -297,21 +367,21 @@ func (b *Batch) IsEmbedding() bool {
|
||||||
// when the batch was initialized. The other argument will be ignored. Adds to the
|
// when the batch was initialized. The other argument will be ignored. Adds to the
|
||||||
// batch with the given position for the given sequence ids, and optionally instructs
|
// batch with the given position for the given sequence ids, and optionally instructs
|
||||||
// to include logits.
|
// to include logits.
|
||||||
func (b *Batch) Add(token int, embed []float32, pos int, seqIds []int, logits bool) {
|
func (b *Batch) Add(token int, embed []float32, pos int, logits bool, seqIds ...int) {
|
||||||
if !b.IsEmbedding() {
|
if !b.IsEmbedding() {
|
||||||
unsafe.Slice(b.c.token, b.batchSize)[b.c.n_tokens] = C.llama_token(token)
|
unsafe.Slice(b.c.token, b.allocSize())[b.c.n_tokens] = C.llama_token(token)
|
||||||
} else {
|
} else {
|
||||||
copy(unsafe.Slice((*float32)(b.c.embd), b.batchSize*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
|
copy(unsafe.Slice((*float32)(b.c.embd), b.allocSize()*b.embedSize)[int(b.c.n_tokens)*b.embedSize:], embed)
|
||||||
}
|
}
|
||||||
unsafe.Slice(b.c.pos, b.batchSize)[b.c.n_tokens] = C.llama_pos(pos)
|
unsafe.Slice(b.c.pos, b.allocSize())[b.c.n_tokens] = C.llama_pos(pos)
|
||||||
unsafe.Slice(b.c.n_seq_id, b.batchSize)[b.c.n_tokens] = C.int(len(seqIds))
|
unsafe.Slice(b.c.n_seq_id, b.allocSize())[b.c.n_tokens] = C.int(len(seqIds))
|
||||||
|
|
||||||
for i, s := range seqIds {
|
for i, s := range seqIds {
|
||||||
unsafe.Slice((unsafe.Slice(b.c.seq_id, b.batchSize)[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
|
unsafe.Slice((unsafe.Slice(b.c.seq_id, b.allocSize())[b.c.n_tokens]), C.int(len(seqIds)))[i] = C.int32_t(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
if logits {
|
if logits {
|
||||||
unsafe.Slice(b.c.logits, b.batchSize)[b.c.n_tokens] = 1
|
unsafe.Slice(b.c.logits, b.allocSize())[b.c.n_tokens] = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
b.c.n_tokens += 1
|
b.c.n_tokens += 1
|
||||||
|
@ -421,71 +491,42 @@ func Quantize(infile, outfile string, ftype uint32) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// llava
|
// vision processing
|
||||||
type ClipContext struct {
|
type ClipContext struct {
|
||||||
c *C.struct_clip_ctx
|
c *C.struct_clip_ctx
|
||||||
m *C.struct_mllama_ctx
|
|
||||||
IsMllama bool
|
|
||||||
embedPin runtime.Pinner
|
|
||||||
pinned bool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getVisionArch(mp *C.char) (string, error) {
|
func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, error) {
|
||||||
gguf_ctx := C.gguf_init_from_file(mp, C.struct_gguf_init_params{no_alloc: true, ctx: (**C.struct_ggml_context)(C.NULL)})
|
|
||||||
if gguf_ctx == nil {
|
|
||||||
return "", errors.New("unable to load vision projector")
|
|
||||||
}
|
|
||||||
defer C.gguf_free(gguf_ctx)
|
|
||||||
|
|
||||||
arch_index := C.gguf_find_key(gguf_ctx, C.CString("general.architecture"))
|
|
||||||
if int(arch_index) < 0 {
|
|
||||||
return "", errors.New("unknown vision model architecture")
|
|
||||||
}
|
|
||||||
|
|
||||||
arch := C.gguf_get_val_str(gguf_ctx, arch_index)
|
|
||||||
|
|
||||||
return C.GoString(arch), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewClipContext(modelPath string) (*ClipContext, error) {
|
|
||||||
mp := C.CString(modelPath)
|
mp := C.CString(modelPath)
|
||||||
defer C.free(unsafe.Pointer(mp))
|
defer C.free(unsafe.Pointer(mp))
|
||||||
|
c := C.clip_model_load(mp, 1)
|
||||||
arch, err := getVisionArch(mp)
|
if c == nil {
|
||||||
if err != nil {
|
return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var cc ClipContext
|
projEmbedSize := int(C.clip_n_mmproj_embd(c))
|
||||||
if arch == "clip" {
|
modelEmbedSize := llamaContext.Model().NEmbd()
|
||||||
cc.c = C.clip_model_load(mp, 1)
|
if projEmbedSize != modelEmbedSize {
|
||||||
} else if arch == "mllama" {
|
return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
|
||||||
cc.m = C.mllama_model_load(mp, 1)
|
|
||||||
cc.IsMllama = true
|
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// XXX: check embedding size?
|
return &ClipContext{c: c}, nil
|
||||||
return &cc, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ClipContext) Free() {
|
func (c *ClipContext) Free() {
|
||||||
if c.c != nil {
|
|
||||||
C.clip_free(c.c)
|
C.clip_free(c.c)
|
||||||
}
|
|
||||||
if c.m != nil {
|
|
||||||
C.mllama_free(c.m)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte) [][]float32 {
|
func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
|
||||||
c := C.llava_image_embed_make_with_bytes(clipContext.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
|
l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
|
||||||
|
if l == nil {
|
||||||
|
return nil, errors.New("unable to make llava embedding from image")
|
||||||
|
}
|
||||||
|
|
||||||
numTokens := int(c.n_image_pos)
|
numTokens := int(l.n_image_pos)
|
||||||
numEmbed := llamaContext.Model().NEmbd()
|
numEmbed := llamaContext.Model().NEmbd()
|
||||||
|
|
||||||
s := unsafe.Slice((*float32)(c.embed), numEmbed*numTokens)
|
s := unsafe.Slice((*float32)(l.embed), numEmbed*numTokens)
|
||||||
|
|
||||||
embed := make([][]float32, numTokens)
|
embed := make([][]float32, numTokens)
|
||||||
rows := make([]float32, len(s))
|
rows := make([]float32, len(s))
|
||||||
|
@ -495,51 +536,66 @@ func NewLlavaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []
|
||||||
embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
|
embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
|
||||||
}
|
}
|
||||||
|
|
||||||
C.llava_image_embed_free(c)
|
C.llava_image_embed_free(l)
|
||||||
|
|
||||||
return embed
|
return embed, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewMllamaImageEmbed(llamaContext *Context, clipContext *ClipContext, data []byte, aspectRatioId int) [][]float32 {
|
type MllamaContext struct {
|
||||||
|
c *C.struct_mllama_ctx
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
|
||||||
|
mp := C.CString(modelPath)
|
||||||
|
defer C.free(unsafe.Pointer(mp))
|
||||||
|
c := C.mllama_model_load(mp, 1)
|
||||||
|
if c == nil {
|
||||||
|
return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
projEmbedSize := int(C.mllama_n_embd(c))
|
||||||
|
modelEmbedSize := llamaContext.Model().NEmbd()
|
||||||
|
if projEmbedSize != modelEmbedSize {
|
||||||
|
return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &MllamaContext{c: c}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MllamaContext) Free() {
|
||||||
|
C.mllama_free(m.c)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
|
||||||
img := C.mllama_image_init()
|
img := C.mllama_image_init()
|
||||||
defer C.mllama_image_free(img)
|
defer C.mllama_image_free(img)
|
||||||
|
|
||||||
C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
|
ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
|
||||||
|
if !ok {
|
||||||
numTokens := int(C.mllama_n_positions(clipContext.m) * C.mllama_n_tiles(clipContext.m))
|
return nil, errors.New("unable to load mllama image data")
|
||||||
numEmbed := llamaContext.Model().NEmbd()
|
|
||||||
|
|
||||||
rows := make([]float32, numEmbed*numTokens)
|
|
||||||
C.mllama_image_encode(clipContext.m, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
|
|
||||||
|
|
||||||
embed := make([][]float32, numTokens)
|
|
||||||
for i := range embed {
|
|
||||||
embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return embed
|
rows := make([]float32, m.EmbedSize(llamaContext))
|
||||||
|
ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
|
||||||
|
if !ok {
|
||||||
|
return nil, errors.New("unable to make mllama embedding from image")
|
||||||
|
}
|
||||||
|
|
||||||
|
embed := make([][]float32, 1)
|
||||||
|
embed[0] = rows
|
||||||
|
|
||||||
|
return embed, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// This really needs to be set on a batch instead
|
func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
|
||||||
func MllamaSetCrossAttn(llamaContext *Context, clipContext *ClipContext, embed [][]float32) {
|
numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
|
||||||
if embed != nil {
|
numEmbed := llamaContext.Model().NEmbd()
|
||||||
if clipContext.pinned {
|
|
||||||
panic("Cross attention state already pinned")
|
|
||||||
}
|
|
||||||
|
|
||||||
embedData := &embed[0][0]
|
return numTokens * numEmbed
|
||||||
clipContext.embedPin.Pin(embedData)
|
}
|
||||||
clipContext.pinned = true
|
|
||||||
|
|
||||||
C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(unsafe.Pointer(embedData)))
|
func (c *Context) SetCrossAttention(state bool) {
|
||||||
} else {
|
C.llama_set_cross_attention(c.c, C.bool(state))
|
||||||
C.llama_set_cross_attn_state(llamaContext.c, (*C.float)(C.NULL))
|
|
||||||
|
|
||||||
if clipContext.pinned {
|
|
||||||
clipContext.embedPin.Unpin()
|
|
||||||
clipContext.pinned = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampling
|
// sampling
|
||||||
|
@ -567,7 +623,7 @@ type SamplingParams struct {
|
||||||
Grammar string
|
Grammar string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
|
func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
|
||||||
var cparams C.struct_gpt_sampler_cparams
|
var cparams C.struct_gpt_sampler_cparams
|
||||||
cparams.top_k = C.int32_t(params.TopK)
|
cparams.top_k = C.int32_t(params.TopK)
|
||||||
cparams.top_p = C.float(params.TopP)
|
cparams.top_p = C.float(params.TopP)
|
||||||
|
@ -590,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
|
||||||
|
|
||||||
cparams.grammar = grammar
|
cparams.grammar = grammar
|
||||||
context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
|
context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
|
||||||
|
if context.c == nil {
|
||||||
|
return nil, errors.New("unable to create sampling context")
|
||||||
|
}
|
||||||
|
|
||||||
runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })
|
runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })
|
||||||
|
|
||||||
return context
|
return context, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SamplingContext) Reset() {
|
func (s *SamplingContext) Reset() {
|
||||||
|
|
3
llama/llama.h
vendored
3
llama/llama.h
vendored
|
@ -266,6 +266,7 @@ extern "C" {
|
||||||
|
|
||||||
llama_token * token;
|
llama_token * token;
|
||||||
float * embd;
|
float * embd;
|
||||||
|
int32_t n_embd;
|
||||||
llama_pos * pos;
|
llama_pos * pos;
|
||||||
int32_t * n_seq_id;
|
int32_t * n_seq_id;
|
||||||
llama_seq_id ** seq_id;
|
llama_seq_id ** seq_id;
|
||||||
|
@ -451,7 +452,7 @@ extern "C" {
|
||||||
|
|
||||||
// TODO (jmorganca): this should most likely be passed in as part of a batch
|
// TODO (jmorganca): this should most likely be passed in as part of a batch
|
||||||
// and not set on the context for all batches.
|
// and not set on the context for all batches.
|
||||||
LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
|
LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
|
||||||
|
|
||||||
// Frees all allocated memory
|
// Frees all allocated memory
|
||||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||||
|
|
2
llama/llava.cpp
vendored
2
llama/llava.cpp
vendored
|
@ -435,7 +435,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||||
if (n_eval > n_batch) {
|
if (n_eval > n_batch) {
|
||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||||
if (llama_decode(ctx_llama, batch)) {
|
if (llama_decode(ctx_llama, batch)) {
|
||||||
LOG_ERR("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -58,6 +58,8 @@ endif
|
||||||
GPU_COMPILER_CUFLAGS = \
|
GPU_COMPILER_CUFLAGS = \
|
||||||
$(GPU_COMPILER_FPIC) \
|
$(GPU_COMPILER_FPIC) \
|
||||||
$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
|
$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
|
||||||
|
-mf16c \
|
||||||
|
-mfma \
|
||||||
-parallel-jobs=2 \
|
-parallel-jobs=2 \
|
||||||
-c \
|
-c \
|
||||||
-O3 \
|
-O3 \
|
||||||
|
@ -77,6 +79,9 @@ GPU_COMPILER_CUFLAGS = \
|
||||||
-D_CRT_SECURE_NO_WARNINGS \
|
-D_CRT_SECURE_NO_WARNINGS \
|
||||||
-D_GNU_SOURCE \
|
-D_GNU_SOURCE \
|
||||||
-D_XOPEN_SOURCE=600 \
|
-D_XOPEN_SOURCE=600 \
|
||||||
|
-DUSE_PROF_API=1 \
|
||||||
|
-std=gnu++14 \
|
||||||
|
-x hip \
|
||||||
-mllvm=-amdgpu-early-inline-all=true \
|
-mllvm=-amdgpu-early-inline-all=true \
|
||||||
-mllvm=-amdgpu-function-calls=false \
|
-mllvm=-amdgpu-function-calls=false \
|
||||||
-Wno-expansion-to-defined \
|
-Wno-expansion-to-defined \
|
||||||
|
@ -87,6 +92,12 @@ GPU_COMPILER_CUFLAGS = \
|
||||||
-Wno-unused-result \
|
-Wno-unused-result \
|
||||||
-I.
|
-I.
|
||||||
|
|
||||||
|
# Workaround buggy P2P copy on some windows multi-GPU setups
|
||||||
|
# This workaround breaks linux systems with small system RAM, so only enable on windows
|
||||||
|
ifeq ($(OS),windows)
|
||||||
|
GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
|
||||||
|
endif
|
||||||
|
|
||||||
include make/gpu.make
|
include make/gpu.make
|
||||||
|
|
||||||
# Adjust the rules from gpu.make to handle the ROCm dependencies properly
|
# Adjust the rules from gpu.make to handle the ROCm dependencies properly
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
# Helpers for managing our vendored llama.cpp repo and patch set
|
# Helpers for managing our vendored llama.cpp repo and patch set
|
||||||
|
|
||||||
# TODO - this should include a manifest file at the top of the tree
|
REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
|
||||||
LLAMACPP_BASE_COMMIT=$(shell cd ../llm/llama.cpp && git rev-parse HEAD)
|
DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
|
||||||
|
|
||||||
LLAMACPP_REPO := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))vendor/
|
include $(REPO_ROOT)llama/vendoring
|
||||||
|
|
||||||
|
LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
|
||||||
|
|
||||||
DST_DIR=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
|
|
||||||
LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
|
LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -85,7 +85,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS
|
||||||
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
|
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
|
||||||
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
|
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
|
||||||
@-mkdir -p $(dir $@)
|
@-mkdir -p $(dir $@)
|
||||||
$(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
|
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
|
||||||
|
|
||||||
# Distribution targets
|
# Distribution targets
|
||||||
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
|
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
|
||||||
|
|
|
@ -12,27 +12,49 @@ kv cache once per run
|
||||||
|
|
||||||
remaining is to implement the cross attention mask
|
remaining is to implement the cross attention mask
|
||||||
---
|
---
|
||||||
include/llama.h | 4 +
|
examples/llava/llava.cpp | 2 +-
|
||||||
src/llama.cpp | 456 ++++++++++++++++++++++++++++++++++++++++++++++--
|
include/llama.h | 5 +
|
||||||
2 files changed, 447 insertions(+), 13 deletions(-)
|
src/llama.cpp | 447 +++++++++++++++++++++++++++++++++++++--
|
||||||
|
3 files changed, 436 insertions(+), 18 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
|
||||||
|
index 8558c6bd..37b2f2e2 100644
|
||||||
|
--- a/examples/llava/llava.cpp
|
||||||
|
+++ b/examples/llava/llava.cpp
|
||||||
|
@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||||
|
if (n_eval > n_batch) {
|
||||||
|
n_eval = n_batch;
|
||||||
|
}
|
||||||
|
- llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||||
|
+ llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||||
|
if (llama_decode(ctx_llama, batch)) {
|
||||||
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
diff --git a/include/llama.h b/include/llama.h
|
diff --git a/include/llama.h b/include/llama.h
|
||||||
index 7cae1bbe..122e3cf1 100644
|
index 7cae1bbe..aca09310 100644
|
||||||
--- a/include/llama.h
|
--- a/include/llama.h
|
||||||
+++ b/include/llama.h
|
+++ b/include/llama.h
|
||||||
@@ -423,6 +423,10 @@ extern "C" {
|
@@ -240,6 +240,7 @@ extern "C" {
|
||||||
|
|
||||||
|
llama_token * token;
|
||||||
|
float * embd;
|
||||||
|
+ int32_t n_embd;
|
||||||
|
llama_pos * pos;
|
||||||
|
int32_t * n_seq_id;
|
||||||
|
llama_seq_id ** seq_id;
|
||||||
|
@@ -423,6 +424,10 @@ extern "C" {
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
struct llama_context_params params);
|
struct llama_context_params params);
|
||||||
|
|
||||||
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
|
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
|
||||||
+ // and not set on the context for all batches.
|
+ // and not set on the context for all batches.
|
||||||
+ LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);
|
+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
|
||||||
+
|
+
|
||||||
// Frees all allocated memory
|
// Frees all allocated memory
|
||||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||||
|
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
index 83b80b59..b189a19a 100644
|
index 83b80b59..35748488 100644
|
||||||
--- a/src/llama.cpp
|
--- a/src/llama.cpp
|
||||||
+++ b/src/llama.cpp
|
+++ b/src/llama.cpp
|
||||||
@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
|
@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
|
||||||
|
@ -160,13 +182,23 @@ index 83b80b59..b189a19a 100644
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
+
|
+
|
||||||
+ bool cross_attention_layer(uint32_t il) const {
|
+ bool cross_attention_layers(uint32_t il) const {
|
||||||
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
|
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
|
||||||
+ }
|
+ }
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||||
@@ -2806,6 +2859,16 @@ struct llama_layer {
|
@@ -2652,6 +2705,9 @@ struct llama_cparams {
|
||||||
|
bool offload_kqv;
|
||||||
|
bool flash_attn;
|
||||||
|
bool no_perf;
|
||||||
|
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
|
||||||
|
+ // and not set on the context for all batches.
|
||||||
|
+ bool cross_attn = false;
|
||||||
|
|
||||||
|
enum llama_pooling_type pooling_type;
|
||||||
|
|
||||||
|
@@ -2806,6 +2862,16 @@ struct llama_layer {
|
||||||
struct ggml_tensor * ffn_down_scale;
|
struct ggml_tensor * ffn_down_scale;
|
||||||
|
|
||||||
struct ggml_tensor * bskcn_tv;
|
struct ggml_tensor * bskcn_tv;
|
||||||
|
@ -183,25 +215,21 @@ index 83b80b59..b189a19a 100644
|
||||||
};
|
};
|
||||||
|
|
||||||
// very similar to llama_batch,
|
// very similar to llama_batch,
|
||||||
@@ -3452,6 +3515,12 @@ struct llama_context {
|
@@ -3452,6 +3518,8 @@ struct llama_context {
|
||||||
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
||||||
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
||||||
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
||||||
+
|
+
|
||||||
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
|
|
||||||
+ // and not set on the context for all batches.
|
|
||||||
+ float * cross_attn_state = nullptr;
|
|
||||||
+ bool cross_attn_state_first_pass = true;
|
|
||||||
+ struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
|
+ struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_lora_weight {
|
struct llama_lora_weight {
|
||||||
@@ -3686,6 +3755,18 @@ static bool llama_kv_cache_init(
|
@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
|
||||||
cache.v_l.reserve(n_layer);
|
cache.v_l.reserve(n_layer);
|
||||||
|
|
||||||
for (int i = 0; i < (int) n_layer; i++) {
|
for (int i = 0; i < (int) n_layer; i++) {
|
||||||
+ // for cross attention layers
|
+ // for cross attention layers
|
||||||
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layer(i)) {
|
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
|
||||||
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||||
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
||||||
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
||||||
|
@ -215,7 +243,7 @@ index 83b80b59..b189a19a 100644
|
||||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||||
|
|
||||||
@@ -5460,12 +5541,14 @@ static void llm_load_hparams(
|
@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
|
||||||
}
|
}
|
||||||
|
|
||||||
// zero-out the per-layer hparams
|
// zero-out the per-layer hparams
|
||||||
|
@ -235,7 +263,7 @@ index 83b80b59..b189a19a 100644
|
||||||
|
|
||||||
// n_head_kv is optional, default to n_head
|
// n_head_kv is optional, default to n_head
|
||||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||||
@@ -5514,7 +5597,7 @@ static void llm_load_hparams(
|
@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
|
||||||
|
|
||||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||||
|
|
||||||
|
@ -244,7 +272,7 @@ index 83b80b59..b189a19a 100644
|
||||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||||
}
|
}
|
||||||
@@ -5554,6 +5637,16 @@ static void llm_load_hparams(
|
@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -261,7 +289,7 @@ index 83b80b59..b189a19a 100644
|
||||||
case LLM_ARCH_MINICPM:
|
case LLM_ARCH_MINICPM:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
@@ -7249,6 +7342,55 @@ static bool llm_load_tensors(
|
@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
|
||||||
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -286,7 +314,7 @@ index 83b80b59..b189a19a 100644
|
||||||
+
|
+
|
||||||
+ auto & layer = model.layers[i];
|
+ auto & layer = model.layers[i];
|
||||||
+
|
+
|
||||||
+ if (hparams.cross_attention_layer(i)) {
|
+ if (hparams.cross_attention_layers(i)) {
|
||||||
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
|
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
|
||||||
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
|
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
|
||||||
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
|
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
|
||||||
|
@ -317,7 +345,7 @@ index 83b80b59..b189a19a 100644
|
||||||
case LLM_ARCH_GROK:
|
case LLM_ARCH_GROK:
|
||||||
{
|
{
|
||||||
if (n_expert == 0) {
|
if (n_expert == 0) {
|
||||||
@@ -9093,7 +9235,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||||
|
|
||||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||||
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||||
|
@ -326,16 +354,7 @@ index 83b80b59..b189a19a 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.vocab_only) {
|
if (params.vocab_only) {
|
||||||
@@ -9178,7 +9320,7 @@ static struct ggml_tensor * llm_build_inp_embd(
|
@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||||
|
|
||||||
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
|
||||||
} else {
|
|
||||||
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
|
||||||
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
|
||||||
inpL = lctx.inp_embd;
|
|
||||||
ggml_set_input(lctx.inp_embd);
|
|
||||||
}
|
|
||||||
@@ -9193,6 +9335,22 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
||||||
return inpL;
|
return inpL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -346,11 +365,10 @@ index 83b80b59..b189a19a 100644
|
||||||
+ const llm_build_cb & cb) {
|
+ const llm_build_cb & cb) {
|
||||||
+ const int64_t n_embd = hparams.n_embd;
|
+ const int64_t n_embd = hparams.n_embd;
|
||||||
+
|
+
|
||||||
+ struct ggml_tensor * inpCAS;
|
+ struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
|
||||||
+ lctx.inp_cross_attn_state = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
|
+ cb(inpCAS, "inp_cross_attn_state", -1);
|
||||||
+ cb(lctx.inp_cross_attn_state, "inp_cross_attn_state", -1);
|
+ ggml_set_input(inpCAS);
|
||||||
+ ggml_set_input(lctx.inp_cross_attn_state);
|
+ lctx.inp_cross_attn_state = inpCAS;
|
||||||
+ inpCAS = lctx.inp_cross_attn_state;
|
|
||||||
+
|
+
|
||||||
+ return inpCAS;
|
+ return inpCAS;
|
||||||
+}
|
+}
|
||||||
|
@ -358,7 +376,7 @@ index 83b80b59..b189a19a 100644
|
||||||
static void llm_build_kv_store(
|
static void llm_build_kv_store(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
const llama_hparams & hparams,
|
const llama_hparams & hparams,
|
||||||
@@ -10167,6 +10325,7 @@ struct llm_build_context {
|
@@ -10167,6 +10323,7 @@ struct llm_build_context {
|
||||||
lctx.inp_pos_bucket = nullptr;
|
lctx.inp_pos_bucket = nullptr;
|
||||||
lctx.inp_embd_enc = nullptr;
|
lctx.inp_embd_enc = nullptr;
|
||||||
lctx.inp_KQ_mask_cross = nullptr;
|
lctx.inp_KQ_mask_cross = nullptr;
|
||||||
|
@ -366,7 +384,7 @@ index 83b80b59..b189a19a 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
void free() {
|
void free() {
|
||||||
@@ -10754,6 +10913,253 @@ struct llm_build_context {
|
@@ -10754,6 +10911,239 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, -1);
|
LLM_NORM_RMS, cb, -1);
|
||||||
cb(cur, "result_norm", -1);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
@ -410,8 +428,8 @@ index 83b80b59..b189a19a 100644
|
||||||
+ LLM_NORM_RMS, cb, il);
|
+ LLM_NORM_RMS, cb, il);
|
||||||
+ cb(cur, "attn_norm", il);
|
+ cb(cur, "attn_norm", il);
|
||||||
+
|
+
|
||||||
+ if (hparams.cross_attention_layer(il)) {
|
+ if (hparams.cross_attention_layers(il)) {
|
||||||
+ if (!lctx.cross_attn_state) {
|
+ if (!batch.embd && !cparams.cross_attn) {
|
||||||
+ continue;
|
+ continue;
|
||||||
+ }
|
+ }
|
||||||
+
|
+
|
||||||
|
@ -422,42 +440,28 @@ index 83b80b59..b189a19a 100644
|
||||||
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
+ cb(Qcur, "Qcur", il);
|
+ cb(Qcur, "Qcur", il);
|
||||||
+
|
+
|
||||||
+ Qcur = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
|
||||||
+ cb(Qcur, "Qcur", il);
|
|
||||||
+
|
|
||||||
+ // TODO: is this required?
|
|
||||||
+ Qcur = ggml_cont(ctx0, Qcur);
|
|
||||||
+ cb(Qcur, "Qcur", il);
|
+ cb(Qcur, "Qcur", il);
|
||||||
+
|
+
|
||||||
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||||
+ cb(Qcur, "Qcur", il);
|
+ cb(Qcur, "Qcur", il);
|
||||||
+
|
+
|
||||||
+ struct ggml_tensor * Kcur;
|
+ struct ggml_tensor * Kcur, * Vcur;
|
||||||
+ if (lctx.cross_attn_state_first_pass) {
|
+ if (batch.embd) {
|
||||||
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
||||||
+ cb(Kcur, "Kcur", il);
|
+ cb(Kcur, "Kcur", il);
|
||||||
+
|
+
|
||||||
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
||||||
+ cb(Kcur, "Kcur", il);
|
+ cb(Kcur, "Kcur", il);
|
||||||
+
|
+
|
||||||
+ Kcur = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
|
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
||||||
+ cb(Kcur, "Kcur", il);
|
|
||||||
+
|
|
||||||
+ // TODO: is this required?
|
|
||||||
+ Kcur = ggml_cont(ctx0, Kcur);
|
|
||||||
+ cb(Kcur, "Kcur", il);
|
+ cb(Kcur, "Kcur", il);
|
||||||
+
|
+
|
||||||
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||||
+ cb(Kcur, "Kcur", il);
|
+ cb(Kcur, "Kcur", il);
|
||||||
+
|
+
|
||||||
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
|
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
|
||||||
+ } else {
|
|
||||||
+ Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
|
|
||||||
+ cb(Kcur, "Kcur (view)", il);
|
|
||||||
+ }
|
|
||||||
+
|
+
|
||||||
+ struct ggml_tensor * Vcur;
|
|
||||||
+ if (lctx.cross_attn_state_first_pass) {
|
|
||||||
+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
||||||
+ cb(Vcur, "Vcur", il);
|
+ cb(Vcur, "Vcur", il);
|
||||||
+
|
+
|
||||||
|
@ -469,6 +473,9 @@ index 83b80b59..b189a19a 100644
|
||||||
+
|
+
|
||||||
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
|
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
|
||||||
+ } else {
|
+ } else {
|
||||||
|
+ Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
|
||||||
|
+ cb(Kcur, "Kcur (view)", il);
|
||||||
|
+
|
||||||
+ Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
|
+ Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
|
||||||
+ cb(Vcur, "Vcur (view)", il);
|
+ cb(Vcur, "Vcur (view)", il);
|
||||||
+ }
|
+ }
|
||||||
|
@ -476,11 +483,8 @@ index 83b80b59..b189a19a 100644
|
||||||
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
||||||
+ cb(kq, "kq", il);
|
+ cb(kq, "kq", il);
|
||||||
+
|
+
|
||||||
+ kq = ggml_scale_inplace(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
|
|
||||||
+ cb(kq, "kq_scaled", il);
|
|
||||||
+
|
|
||||||
+ // TODO: apply causal masks
|
+ // TODO: apply causal masks
|
||||||
+ struct ggml_tensor * kq_soft_max = ggml_soft_max_inplace(ctx0, kq);
|
+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
||||||
+ cb(kq_soft_max, "kq_soft_max", il);
|
+ cb(kq_soft_max, "kq_soft_max", il);
|
||||||
+
|
+
|
||||||
+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
||||||
|
@ -620,7 +624,7 @@ index 83b80b59..b189a19a 100644
|
||||||
// lm_head
|
// lm_head
|
||||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||||
cb(cur, "result_output", -1);
|
cb(cur, "result_output", -1);
|
||||||
@@ -16501,6 +16907,10 @@ static struct ggml_cgraph * llama_build_graph(
|
@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_llama();
|
result = llm.build_llama();
|
||||||
} break;
|
} break;
|
||||||
|
@ -631,33 +635,48 @@ index 83b80b59..b189a19a 100644
|
||||||
case LLM_ARCH_BAICHUAN:
|
case LLM_ARCH_BAICHUAN:
|
||||||
{
|
{
|
||||||
result = llm.build_baichuan();
|
result = llm.build_baichuan();
|
||||||
@@ -16773,6 +17183,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||||
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
+ // TODO (jmorganca): this might copy a lot of data on every request of a
|
if (batch.embd) {
|
||||||
+ // single generation even though it doesn't change, so we should
|
- const int64_t n_embd = hparams.n_embd;
|
||||||
+ // find a way to not set this more than one time per image
|
- const int64_t n_tokens = batch.n_tokens;
|
||||||
+ if (lctx.inp_cross_attn_state &&
|
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
|
||||||
+ lctx.inp_cross_attn_state->buffer) {
|
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||||
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
|
+ // zero out inp_embd since it's not used
|
||||||
|
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
|
||||||
|
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
|
||||||
|
+ inp_embd_data[i] = 0.0f;
|
||||||
+ }
|
+ }
|
||||||
+
|
+ } else {
|
||||||
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
+ const int64_t n_embd = hparams.n_embd;
|
||||||
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
+ const int64_t n_tokens = batch.n_tokens;
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
|
||||||
@@ -17455,6 +17873,10 @@ static int llama_decode_internal(
|
|
||||||
|
|
||||||
llama_set_inputs(lctx, ubatch);
|
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||||
|
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
+ // TODO: replace with something better to find out if its
|
if (batch.pos && lctx.inp_pos) {
|
||||||
+ // our first actual pass
|
@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
|
||||||
+ lctx.cross_attn_state_first_pass = false;
|
n_outputs = 1;
|
||||||
+
|
}
|
||||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
|
||||||
|
|
||||||
// update the kv ring buffer
|
- lctx.sbatch.from_batch(batch_all, n_embd,
|
||||||
@@ -18648,7 +19070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
+ lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
|
||||||
|
/* simple_split */ !kv_self.recurrent,
|
||||||
|
/* logits_all */ n_outputs == n_tokens_all);
|
||||||
|
|
||||||
|
@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
|
||||||
|
|
||||||
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
|
||||||
|
- lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||||
|
+ lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||||
|
|
||||||
|
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
|
||||||
|
|
||||||
|
@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
if (llama_model_has_encoder(&model)) {
|
if (llama_model_has_encoder(&model)) {
|
||||||
n_attn_layer *= 3;
|
n_attn_layer *= 3;
|
||||||
}
|
}
|
||||||
|
@ -668,19 +687,7 @@ index 83b80b59..b189a19a 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t total_size_org = 0;
|
size_t total_size_org = 0;
|
||||||
@@ -19744,6 +20168,11 @@ struct llama_context * llama_new_context_with_model(
|
@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||||
return ctx;
|
|
||||||
}
|
|
||||||
|
|
||||||
+void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state) {
|
|
||||||
+ ctx->cross_attn_state_first_pass = true;
|
|
||||||
+ ctx->cross_attn_state = cross_attn_state;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void llama_free(struct llama_context * ctx) {
|
|
||||||
delete ctx;
|
|
||||||
}
|
|
||||||
@@ -19814,6 +20243,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
||||||
|
|
||||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||||
case LLM_ARCH_LLAMA:
|
case LLM_ARCH_LLAMA:
|
||||||
|
@ -688,3 +695,38 @@ index 83b80b59..b189a19a 100644
|
||||||
case LLM_ARCH_BAICHUAN:
|
case LLM_ARCH_BAICHUAN:
|
||||||
case LLM_ARCH_STARCODER:
|
case LLM_ARCH_STARCODER:
|
||||||
case LLM_ARCH_PLAMO:
|
case LLM_ARCH_PLAMO:
|
||||||
|
@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
|
||||||
|
ctx->cparams.causal_attn = causal_attn;
|
||||||
|
}
|
||||||
|
|
||||||
|
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
|
||||||
|
+ ctx->cparams.cross_attn = cross_attention;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
struct llama_batch llama_batch_get_one(
|
||||||
|
llama_token * tokens,
|
||||||
|
int32_t n_tokens,
|
||||||
|
@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
|
||||||
|
/*n_tokens =*/ n_tokens,
|
||||||
|
/*tokens =*/ tokens,
|
||||||
|
/*embd =*/ nullptr,
|
||||||
|
+ /*n_embd =*/ 0,
|
||||||
|
/*pos =*/ nullptr,
|
||||||
|
/*n_seq_id =*/ nullptr,
|
||||||
|
/*seq_id =*/ nullptr,
|
||||||
|
@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||||
|
/*n_tokens =*/ 0,
|
||||||
|
/*tokens =*/ nullptr,
|
||||||
|
/*embd =*/ nullptr,
|
||||||
|
+ /*n_embd =*/ 0,
|
||||||
|
/*pos =*/ nullptr,
|
||||||
|
/*n_seq_id =*/ nullptr,
|
||||||
|
/*seq_id =*/ nullptr,
|
||||||
|
@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
||||||
|
|
||||||
|
if (embd) {
|
||||||
|
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||||
|
+ batch.n_embd = embd;
|
||||||
|
} else {
|
||||||
|
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||||
|
}
|
||||||
|
|
|
@ -2,7 +2,6 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"hash/maphash"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"reflect"
|
"reflect"
|
||||||
"time"
|
"time"
|
||||||
|
@ -20,10 +19,6 @@ type InputCache struct {
|
||||||
// optimize cache eviction for multiple users
|
// optimize cache eviction for multiple users
|
||||||
multiUserCache bool
|
multiUserCache bool
|
||||||
|
|
||||||
// cache of images to embeddings
|
|
||||||
images []imageCache
|
|
||||||
imageHash maphash.Hash
|
|
||||||
|
|
||||||
lc *llama.Context
|
lc *llama.Context
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,7 +36,6 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
|
||||||
numCtx: kvSize / numSlots,
|
numCtx: kvSize / numSlots,
|
||||||
slots: slots,
|
slots: slots,
|
||||||
multiUserCache: multiUserCache,
|
multiUserCache: multiUserCache,
|
||||||
images: make([]imageCache, numSlots),
|
|
||||||
lc: lc,
|
lc: lc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -211,55 +205,3 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscar
|
||||||
}
|
}
|
||||||
slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
|
slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
|
||||||
}
|
}
|
||||||
|
|
||||||
// Locking: Lookup and store operations on imageCache require a lock
|
|
||||||
// to be held that serializes these with each other. Hash does not
|
|
||||||
// require a lock nor they need to be serialized with InputCacheSlot.
|
|
||||||
|
|
||||||
type imageCache struct {
|
|
||||||
key uint64
|
|
||||||
val [][]float32
|
|
||||||
lastUsed time.Time
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *InputCache) HashImage(image []byte) uint64 {
|
|
||||||
c.imageHash.Reset()
|
|
||||||
_, _ = c.imageHash.Write(image)
|
|
||||||
return c.imageHash.Sum64()
|
|
||||||
}
|
|
||||||
|
|
||||||
var ErrImageNotFound = errors.New("image not found in cache")
|
|
||||||
|
|
||||||
func (c *InputCache) FindImage(hash uint64) ([][]float32, error) {
|
|
||||||
for i := range c.images {
|
|
||||||
if c.images[i].key == hash {
|
|
||||||
slog.Debug("loading image embeddings from cache", "entry", i)
|
|
||||||
c.images[i].lastUsed = time.Now()
|
|
||||||
return c.images[i].val, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil, ErrImageNotFound
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *InputCache) AddImage(hash uint64, embed [][]float32) {
|
|
||||||
best := time.Now()
|
|
||||||
var bestImage int
|
|
||||||
|
|
||||||
for i := range c.images {
|
|
||||||
if c.images[i].key == hash {
|
|
||||||
bestImage = i
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
if c.images[i].lastUsed.Compare(best) < 0 {
|
|
||||||
best = c.images[i].lastUsed
|
|
||||||
bestImage = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
|
|
||||||
c.images[bestImage].key = hash
|
|
||||||
c.images[bestImage].val = embed
|
|
||||||
c.images[bestImage].lastUsed = time.Now()
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
@ -228,77 +227,3 @@ func TestFindCacheSlot(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestImageCache(t *testing.T) {
|
|
||||||
cache := NewInputCache(nil, 2048, 4, false)
|
|
||||||
|
|
||||||
valA := [][]float32{{0.1, 0.2}, {0.3}}
|
|
||||||
valB := [][]float32{{0.4}, {0.5}, {0.6}}
|
|
||||||
valC := [][]float32{{0.7}}
|
|
||||||
valD := [][]float32{{0.8}}
|
|
||||||
valE := [][]float32{{0.9}}
|
|
||||||
|
|
||||||
// Empty cache
|
|
||||||
result, err := cache.FindImage(0x5adb61d31933a946)
|
|
||||||
if err != ErrImageNotFound {
|
|
||||||
t.Errorf("found result in empty cache: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Insert A
|
|
||||||
cache.AddImage(0x5adb61d31933a946, valA)
|
|
||||||
|
|
||||||
result, err = cache.FindImage(0x5adb61d31933a946)
|
|
||||||
if !reflect.DeepEqual(result, valA) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Insert B
|
|
||||||
cache.AddImage(0x011551369a34a901, valB)
|
|
||||||
|
|
||||||
result, err = cache.FindImage(0x5adb61d31933a946)
|
|
||||||
if !reflect.DeepEqual(result, valA) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0x011551369a34a901)
|
|
||||||
if !reflect.DeepEqual(result, valB) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Replace B with C
|
|
||||||
cache.AddImage(0x011551369a34a901, valC)
|
|
||||||
|
|
||||||
result, err = cache.FindImage(0x5adb61d31933a946)
|
|
||||||
if !reflect.DeepEqual(result, valA) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0x011551369a34a901)
|
|
||||||
if !reflect.DeepEqual(result, valC) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Evict A
|
|
||||||
cache.AddImage(0x756b218a517e7353, valB)
|
|
||||||
cache.AddImage(0x75e5e8d35d7e3967, valD)
|
|
||||||
cache.AddImage(0xd96f7f268ca0646e, valE)
|
|
||||||
|
|
||||||
result, err = cache.FindImage(0x5adb61d31933a946)
|
|
||||||
if reflect.DeepEqual(result, valA) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0x756b218a517e7353)
|
|
||||||
if !reflect.DeepEqual(result, valB) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0x011551369a34a901)
|
|
||||||
if !reflect.DeepEqual(result, valC) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0x75e5e8d35d7e3967)
|
|
||||||
if !reflect.DeepEqual(result, valD) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
result, err = cache.FindImage(0xd96f7f268ca0646e)
|
|
||||||
if !reflect.DeepEqual(result, valE) {
|
|
||||||
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
183
llama/runner/image.go
Normal file
183
llama/runner/image.go
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"hash/maphash"
|
||||||
|
"log/slog"
|
||||||
|
"slices"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/llama"
|
||||||
|
)
|
||||||
|
|
||||||
|
const imageCacheSize = 4
|
||||||
|
|
||||||
|
type ImageContext struct {
|
||||||
|
// mu is required to be held when generating embeddings or accessing the cache
|
||||||
|
mu sync.Mutex
|
||||||
|
|
||||||
|
clip *llama.ClipContext
|
||||||
|
mllama *llama.MllamaContext
|
||||||
|
|
||||||
|
// cache of images to embeddings
|
||||||
|
images []imageCache
|
||||||
|
imageHash maphash.Hash
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
|
||||||
|
arch, err := llama.GetModelArch(modelPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
var c ImageContext
|
||||||
|
if arch == "clip" {
|
||||||
|
c.clip, err = llama.NewClipContext(llamaContext, modelPath)
|
||||||
|
} else if arch == "mllama" {
|
||||||
|
c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
c.images = make([]imageCache, imageCacheSize)
|
||||||
|
|
||||||
|
return &c, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) Free(modelPath string) {
|
||||||
|
if c == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.clip != nil {
|
||||||
|
c.clip.Free()
|
||||||
|
}
|
||||||
|
if c.mllama != nil {
|
||||||
|
c.mllama.Free()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
|
||||||
|
if c == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(data) <= 0 {
|
||||||
|
return nil, errors.New("received zero length image")
|
||||||
|
}
|
||||||
|
|
||||||
|
hash := c.hashImage(data)
|
||||||
|
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
|
embed, err := c.findImage(hash)
|
||||||
|
if err != nil {
|
||||||
|
if c.mllama != nil {
|
||||||
|
embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if c.clip != nil {
|
||||||
|
embed, err = c.clip.NewEmbed(llamaContext, data)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, errors.New("received image but vision model not loaded")
|
||||||
|
}
|
||||||
|
|
||||||
|
c.addImage(hash, embed)
|
||||||
|
}
|
||||||
|
|
||||||
|
return embed, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
|
||||||
|
// If images are not supported, we don't need to allocate embedding batches
|
||||||
|
if c == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mllama maps an image to 1 embedding token (llava creates many tokens)
|
||||||
|
// and doesn't support more than a single image per request.
|
||||||
|
// The embeddings are large (100 MB), so allocating a big batch can fail
|
||||||
|
// on some systems
|
||||||
|
if c.mllama != nil {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return configuredBatchSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
|
||||||
|
if c != nil && c.mllama != nil {
|
||||||
|
return c.mllama.EmbedSize(llamaContext)
|
||||||
|
} else {
|
||||||
|
return llamaContext.Model().NEmbd()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
|
||||||
|
if c == nil || c.mllama == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return slices.ContainsFunc(inputs, func(input input) bool {
|
||||||
|
return input.embed != nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
type imageCache struct {
|
||||||
|
key uint64
|
||||||
|
val [][]float32
|
||||||
|
lastUsed time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) hashImage(image []byte) uint64 {
|
||||||
|
c.imageHash.Reset()
|
||||||
|
_, _ = c.imageHash.Write(image)
|
||||||
|
return c.imageHash.Sum64()
|
||||||
|
}
|
||||||
|
|
||||||
|
var errImageNotFound = errors.New("image not found in cache")
|
||||||
|
|
||||||
|
func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
|
||||||
|
for i := range c.images {
|
||||||
|
if c.images[i].key == hash {
|
||||||
|
slog.Debug("loading image embeddings from cache", "entry", i)
|
||||||
|
c.images[i].lastUsed = time.Now()
|
||||||
|
return c.images[i].val, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, errImageNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
|
||||||
|
best := time.Now()
|
||||||
|
var bestImage int
|
||||||
|
|
||||||
|
for i := range c.images {
|
||||||
|
if c.images[i].key == hash {
|
||||||
|
bestImage = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.images[i].lastUsed.Compare(best) < 0 {
|
||||||
|
best = c.images[i].lastUsed
|
||||||
|
bestImage = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
|
||||||
|
c.images[bestImage].key = hash
|
||||||
|
c.images[bestImage].val = embed
|
||||||
|
c.images[bestImage].lastUsed = time.Now()
|
||||||
|
}
|
80
llama/runner/image_test.go
Normal file
80
llama/runner/image_test.go
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestImageCache(t *testing.T) {
|
||||||
|
cache := ImageContext{images: make([]imageCache, 4)}
|
||||||
|
|
||||||
|
valA := [][]float32{{0.1, 0.2}, {0.3}}
|
||||||
|
valB := [][]float32{{0.4}, {0.5}, {0.6}}
|
||||||
|
valC := [][]float32{{0.7}}
|
||||||
|
valD := [][]float32{{0.8}}
|
||||||
|
valE := [][]float32{{0.9}}
|
||||||
|
|
||||||
|
// Empty cache
|
||||||
|
result, err := cache.findImage(0x5adb61d31933a946)
|
||||||
|
if err != errImageNotFound {
|
||||||
|
t.Errorf("found result in empty cache: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert A
|
||||||
|
cache.addImage(0x5adb61d31933a946, valA)
|
||||||
|
|
||||||
|
result, err = cache.findImage(0x5adb61d31933a946)
|
||||||
|
if !reflect.DeepEqual(result, valA) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert B
|
||||||
|
cache.addImage(0x011551369a34a901, valB)
|
||||||
|
|
||||||
|
result, err = cache.findImage(0x5adb61d31933a946)
|
||||||
|
if !reflect.DeepEqual(result, valA) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0x011551369a34a901)
|
||||||
|
if !reflect.DeepEqual(result, valB) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace B with C
|
||||||
|
cache.addImage(0x011551369a34a901, valC)
|
||||||
|
|
||||||
|
result, err = cache.findImage(0x5adb61d31933a946)
|
||||||
|
if !reflect.DeepEqual(result, valA) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0x011551369a34a901)
|
||||||
|
if !reflect.DeepEqual(result, valC) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Evict A
|
||||||
|
cache.addImage(0x756b218a517e7353, valB)
|
||||||
|
cache.addImage(0x75e5e8d35d7e3967, valD)
|
||||||
|
cache.addImage(0xd96f7f268ca0646e, valE)
|
||||||
|
|
||||||
|
result, err = cache.findImage(0x5adb61d31933a946)
|
||||||
|
if reflect.DeepEqual(result, valA) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0x756b218a517e7353)
|
||||||
|
if !reflect.DeepEqual(result, valB) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0x011551369a34a901)
|
||||||
|
if !reflect.DeepEqual(result, valC) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0x75e5e8d35d7e3967)
|
||||||
|
if !reflect.DeepEqual(result, valD) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
result, err = cache.findImage(0xd96f7f268ca0646e)
|
||||||
|
if !reflect.DeepEqual(result, valE) {
|
||||||
|
t.Errorf("failed to find expected value: result %v, err %v", result, err)
|
||||||
|
}
|
||||||
|
}
|
|
@ -52,6 +52,10 @@ type Sequence struct {
|
||||||
// input cache being used by this sequence
|
// input cache being used by this sequence
|
||||||
cache *InputCacheSlot
|
cache *InputCacheSlot
|
||||||
|
|
||||||
|
// does this sequence require cross-attention layers to be processed? - if we have seen
|
||||||
|
// an image for certain multi-modal models
|
||||||
|
crossAttention bool
|
||||||
|
|
||||||
// channel to send responses over
|
// channel to send responses over
|
||||||
responses chan string
|
responses chan string
|
||||||
|
|
||||||
|
@ -127,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
|
||||||
|
|
||||||
var sc *llama.SamplingContext
|
var sc *llama.SamplingContext
|
||||||
if params.samplingParams != nil {
|
if params.samplingParams != nil {
|
||||||
sc = llama.NewSamplingContext(s.model, *params.samplingParams)
|
sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
for _, input := range inputs {
|
for _, input := range inputs {
|
||||||
if input.embed == nil {
|
if input.embed == nil {
|
||||||
sc.Accept(input.token, false)
|
sc.Accept(input.token, false)
|
||||||
|
@ -190,16 +197,10 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
||||||
return nil, fmt.Errorf("invalid image index: %d", n)
|
return nil, fmt.Errorf("invalid image index: %d", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
hash := s.cache.HashImage(images[imageIndex].Data)
|
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
|
||||||
|
|
||||||
// Vision models cannot be accessed concurrently
|
|
||||||
s.clip.mu.Lock()
|
|
||||||
embed, err := s.cache.FindImage(hash)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
embed = llama.NewLlavaImageEmbed(s.lc, s.clip.cc, images[imageIndex].Data)
|
return nil, err
|
||||||
s.cache.AddImage(hash, embed)
|
|
||||||
}
|
}
|
||||||
s.clip.mu.Unlock()
|
|
||||||
|
|
||||||
for _, e := range embed {
|
for _, e := range embed {
|
||||||
inputs = append(inputs, input{embed: e})
|
inputs = append(inputs, input{embed: e})
|
||||||
|
@ -207,41 +208,17 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.clip.cc != nil {
|
|
||||||
var embed [][]float32
|
|
||||||
|
|
||||||
if s.clip.cc.IsMllama && len(images) >= 1 {
|
|
||||||
hash := s.cache.HashImage(images[0].Data)
|
|
||||||
|
|
||||||
s.clip.mu.Lock()
|
|
||||||
var err error
|
|
||||||
embed, err = s.cache.FindImage(hash)
|
|
||||||
if err != nil {
|
|
||||||
embed = llama.NewMllamaImageEmbed(s.lc, s.clip.cc, images[0].Data, images[0].AspectRatioID)
|
|
||||||
s.cache.AddImage(hash, embed)
|
|
||||||
}
|
|
||||||
s.clip.mu.Unlock()
|
|
||||||
}
|
|
||||||
s.mu.Lock()
|
|
||||||
llama.MllamaSetCrossAttn(s.lc, s.clip.cc, embed)
|
|
||||||
s.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
return inputs, nil
|
return inputs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type clip struct {
|
|
||||||
cc *llama.ClipContext
|
|
||||||
mu sync.Mutex
|
|
||||||
}
|
|
||||||
|
|
||||||
type Server struct {
|
type Server struct {
|
||||||
model *llama.Model
|
model *llama.Model
|
||||||
lc *llama.Context
|
lc *llama.Context
|
||||||
|
|
||||||
// required for image embeddings
|
// required for image embeddings
|
||||||
clip clip
|
image *ImageContext
|
||||||
|
|
||||||
|
// TODO (jmorganca): make this n_batch
|
||||||
batchSize int
|
batchSize int
|
||||||
|
|
||||||
// parallel is the number of parallel requests to handle
|
// parallel is the number of parallel requests to handle
|
||||||
|
@ -327,22 +304,31 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
|
||||||
close(seq.responses)
|
close(seq.responses)
|
||||||
close(seq.embedding)
|
close(seq.embedding)
|
||||||
seq.cache.InUse = false
|
seq.cache.InUse = false
|
||||||
if s.clip.cc != nil {
|
|
||||||
llama.MllamaSetCrossAttn(s.lc, s.clip.cc, nil)
|
|
||||||
}
|
|
||||||
s.seqs[seqIndex] = nil
|
s.seqs[seqIndex] = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) run(ctx context.Context) {
|
func (s *Server) run(ctx context.Context) {
|
||||||
s.ready.Wait()
|
s.ready.Wait()
|
||||||
|
|
||||||
// logically these batches are used only within the context of processBatch
|
// Logically these batches are used only within the context of processBatch
|
||||||
// but it is better for performance to allocate them once here
|
// but it is better for performance to allocate them once here
|
||||||
tokenBatch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
|
tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
defer tokenBatch.Free()
|
defer tokenBatch.Free()
|
||||||
|
|
||||||
embedBatch := llama.NewBatch(s.batchSize*len(s.seqs), s.lc.Model().NEmbd(), len(s.seqs))
|
var embedBatch *llama.Batch
|
||||||
|
embedBatchSize := s.image.BatchSize(s.batchSize)
|
||||||
|
if embedBatchSize != 0 {
|
||||||
|
embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
defer embedBatch.Free()
|
defer embedBatch.Free()
|
||||||
|
} else {
|
||||||
|
embedBatch = &llama.Batch{}
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
|
@ -371,6 +357,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
var batch *llama.Batch
|
var batch *llama.Batch
|
||||||
|
crossAttention := false
|
||||||
|
|
||||||
seqIdx := s.nextSeq - 1
|
seqIdx := s.nextSeq - 1
|
||||||
for range s.seqs {
|
for range s.seqs {
|
||||||
|
@ -404,18 +391,19 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||||
batch = tokenBatch
|
batch = tokenBatch
|
||||||
} else {
|
} else {
|
||||||
batch = embedBatch
|
batch = embedBatch
|
||||||
|
seq.crossAttention = s.image.NeedCrossAttention(input)
|
||||||
}
|
}
|
||||||
} else if embedding != batch.IsEmbedding() {
|
} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
|
||||||
s.nextSeq = seqIdx
|
s.nextSeq = seqIdx
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// todo: make this n_batch
|
if i >= batch.Size() {
|
||||||
if i >= s.batchSize {
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
|
crossAttention = seq.crossAttention
|
||||||
|
batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
|
||||||
seq.numPast++
|
seq.numPast++
|
||||||
numInputsProcessed++
|
numInputsProcessed++
|
||||||
}
|
}
|
||||||
|
@ -431,6 +419,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.lc.SetCrossAttention(crossAttention)
|
||||||
|
|
||||||
err := s.lc.Decode(batch)
|
err := s.lc.Decode(batch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("failed to decode batch", "error", err)
|
slog.Error("failed to decode batch", "error", err)
|
||||||
|
@ -648,6 +638,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||||
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
|
||||||
|
|
||||||
s.seqs[i] = seq
|
s.seqs[i] = seq
|
||||||
s.cond.Signal()
|
s.cond.Signal()
|
||||||
break
|
break
|
||||||
|
@ -815,7 +808,7 @@ func (s *Server) loadModel(
|
||||||
|
|
||||||
if ppath != "" {
|
if ppath != "" {
|
||||||
var err error
|
var err error
|
||||||
s.clip.cc, err = llama.NewClipContext(ppath)
|
s.image, err = NewImageContext(s.lc, ppath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
@ -844,14 +837,8 @@ func main() {
|
||||||
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
|
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
|
||||||
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
|
||||||
multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
|
||||||
// Expose requirements as a JSON output to stdout
|
|
||||||
requirements := flag.Bool("requirements", false, "print json requirement information")
|
requirements := flag.Bool("requirements", false, "print json requirement information")
|
||||||
|
|
||||||
// These are either ignored by llama.cpp or have no significance to us
|
|
||||||
_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
|
|
||||||
_ = flag.Bool("log-disable", false, "disables logging to a file")
|
|
||||||
_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
|
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
if *requirements {
|
if *requirements {
|
||||||
printRequirements(os.Stdout)
|
printRequirements(os.Stdout)
|
||||||
|
@ -874,7 +861,7 @@ func main() {
|
||||||
})
|
})
|
||||||
slog.SetDefault(slog.New(handler))
|
slog.SetDefault(slog.New(handler))
|
||||||
slog.Info("starting go runner")
|
slog.Info("starting go runner")
|
||||||
slog.Debug("system info", "cpu", llama.PrintSystemInfo(), "threads", *threads)
|
slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)
|
||||||
|
|
||||||
server := &Server{
|
server := &Server{
|
||||||
batchSize: *batchSize,
|
batchSize: *batchSize,
|
||||||
|
|
4
llama/sampling_ext.cpp
vendored
4
llama/sampling_ext.cpp
vendored
|
@ -5,6 +5,7 @@
|
||||||
struct gpt_sampler *gpt_sampler_cinit(
|
struct gpt_sampler *gpt_sampler_cinit(
|
||||||
const struct llama_model *model, struct gpt_sampler_cparams *params)
|
const struct llama_model *model, struct gpt_sampler_cparams *params)
|
||||||
{
|
{
|
||||||
|
try {
|
||||||
gpt_sampler_params sparams;
|
gpt_sampler_params sparams;
|
||||||
sparams.top_k = params->top_k;
|
sparams.top_k = params->top_k;
|
||||||
sparams.top_p = params->top_p;
|
sparams.top_p = params->top_p;
|
||||||
|
@ -23,6 +24,9 @@ struct gpt_sampler *gpt_sampler_cinit(
|
||||||
sparams.seed = params->seed;
|
sparams.seed = params->seed;
|
||||||
sparams.grammar = params->grammar;
|
sparams.grammar = params->grammar;
|
||||||
return gpt_sampler_init(model, sparams);
|
return gpt_sampler_init(model, sparams);
|
||||||
|
} catch (const std::exception & err) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_sampler_cfree(struct gpt_sampler *sampler)
|
void gpt_sampler_cfree(struct gpt_sampler *sampler)
|
||||||
|
|
1
llama/vendoring
Normal file
1
llama/vendoring
Normal file
|
@ -0,0 +1 @@
|
||||||
|
LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555
|
|
@ -1,15 +0,0 @@
|
||||||
set(TARGET ollama_llama_server)
|
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
|
||||||
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|
||||||
add_executable(${TARGET} server.cpp utils.hpp httplib.h)
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
|
||||||
)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
|
|
||||||
if (WIN32)
|
|
||||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
|
||||||
target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
|
|
||||||
endif()
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,661 +0,0 @@
|
||||||
// MIT License
|
|
||||||
|
|
||||||
// Copyright (c) 2023 Georgi Gerganov
|
|
||||||
|
|
||||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
// of this software and associated documentation files (the "Software"), to deal
|
|
||||||
// in the Software without restriction, including without limitation the rights
|
|
||||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
// copies of the Software, and to permit persons to whom the Software is
|
|
||||||
// furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
// The above copyright notice and this permission notice shall be included in all
|
|
||||||
// copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
// SOFTWARE.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <set>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <random>
|
|
||||||
#include <iostream>
|
|
||||||
#include <thread>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
|
||||||
|
|
||||||
using json = nlohmann::json;
|
|
||||||
|
|
||||||
extern bool server_verbose;
|
|
||||||
extern bool server_log_json;
|
|
||||||
|
|
||||||
#ifndef SERVER_VERBOSE
|
|
||||||
#define SERVER_VERBOSE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
#define LOG_VERBOSE(MSG, ...)
|
|
||||||
#else
|
|
||||||
#define LOG_VERBOSE(MSG, ...) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
if (server_verbose) \
|
|
||||||
{ \
|
|
||||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_DEBUG( MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
|
|
||||||
enum server_state {
|
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
|
||||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
|
||||||
};
|
|
||||||
|
|
||||||
enum task_type {
|
|
||||||
TASK_TYPE_COMPLETION,
|
|
||||||
TASK_TYPE_CANCEL,
|
|
||||||
TASK_TYPE_NEXT_RESPONSE,
|
|
||||||
TASK_TYPE_METRICS
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_server {
|
|
||||||
int id = -1; // to be filled by llama_server_queue
|
|
||||||
int target_id;
|
|
||||||
task_type type;
|
|
||||||
json data;
|
|
||||||
bool infill_mode = false;
|
|
||||||
bool embedding_mode = false;
|
|
||||||
int multitask_id = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_result {
|
|
||||||
int id;
|
|
||||||
int multitask_id = -1;
|
|
||||||
bool stop;
|
|
||||||
bool error;
|
|
||||||
json result_json;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_multi {
|
|
||||||
int id;
|
|
||||||
std::set<int> subtasks_remaining{};
|
|
||||||
std::vector<task_result> results{};
|
|
||||||
};
|
|
||||||
|
|
||||||
// completion token output with probabilities
|
|
||||||
struct completion_token_output {
|
|
||||||
struct token_prob
|
|
||||||
{
|
|
||||||
llama_token tok;
|
|
||||||
float prob;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<token_prob> probs;
|
|
||||||
llama_token tok;
|
|
||||||
std::string text_to_send;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct token_translator {
|
|
||||||
llama_context * ctx;
|
|
||||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
|
||||||
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
|
||||||
std::stringstream ss_tid;
|
|
||||||
ss_tid << std::this_thread::get_id();
|
|
||||||
json log = nlohmann::ordered_json{
|
|
||||||
{"tid", ss_tid.str()},
|
|
||||||
{"timestamp", time(nullptr)},
|
|
||||||
};
|
|
||||||
|
|
||||||
if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (server_log_json) {
|
|
||||||
log.merge_patch(
|
|
||||||
{
|
|
||||||
{"level", level},
|
|
||||||
{"function", function},
|
|
||||||
{"line", line},
|
|
||||||
{"msg", message},
|
|
||||||
});
|
|
||||||
if (!extra.empty()) {
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
|
|
||||||
} else {
|
|
||||||
if (!extra.empty()) {
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << level << " [" << function << "] " << message << " |";
|
|
||||||
for (const auto& el : log.items())
|
|
||||||
{
|
|
||||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
|
||||||
ss << " " << el.key() << "=" << value;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string str = ss.str();
|
|
||||||
printf("%.*s\n", (int)str.size(), str.data());
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// server utils
|
|
||||||
//
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value) {
|
|
||||||
// Fallback null to default value
|
|
||||||
return body.contains(key) && !body.at(key).is_null()
|
|
||||||
? body.value(key, default_value)
|
|
||||||
: default_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
|
||||||
inline bool verify_custom_template(const std::string & tmpl) {
|
|
||||||
llama_chat_message chat[] = {{"user", "test"}};
|
|
||||||
std::vector<char> buf(1);
|
|
||||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
|
|
||||||
return res >= 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
|
||||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
|
||||||
size_t alloc_size = 0;
|
|
||||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
|
||||||
std::vector<std::string> str(messages.size() * 2);
|
|
||||||
std::vector<llama_chat_message> chat(messages.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < messages.size(); ++i) {
|
|
||||||
auto &curr_msg = messages[i];
|
|
||||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
|
||||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
|
||||||
alloc_size += str[i*2 + 1].length();
|
|
||||||
chat[i].role = str[i*2 + 0].c_str();
|
|
||||||
chat[i].content = str[i*2 + 1].c_str();
|
|
||||||
}
|
|
||||||
|
|
||||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
|
||||||
std::vector<char> buf(alloc_size * 2);
|
|
||||||
|
|
||||||
// run the first time to get the total output length
|
|
||||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
|
||||||
|
|
||||||
// if it turns out that our buffer is too small, we resize it
|
|
||||||
if ((size_t) res > buf.size()) {
|
|
||||||
buf.resize(res);
|
|
||||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string formatted_chat(buf.data(), res);
|
|
||||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
|
||||||
|
|
||||||
return formatted_chat;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// work queue utils
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_server_queue {
|
|
||||||
int id = 0;
|
|
||||||
std::mutex mutex_tasks;
|
|
||||||
bool running;
|
|
||||||
// queues
|
|
||||||
std::vector<task_server> queue_tasks;
|
|
||||||
std::vector<task_server> queue_tasks_deferred;
|
|
||||||
std::vector<task_multi> queue_multitasks;
|
|
||||||
std::condition_variable condition_tasks;
|
|
||||||
// callback functions
|
|
||||||
std::function<void(task_server&)> callback_new_task;
|
|
||||||
std::function<void(task_multi&)> callback_finish_multitask;
|
|
||||||
std::function<void(void)> callback_run_slots;
|
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
|
||||||
int post(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (task.id == -1) {
|
|
||||||
task.id = id++;
|
|
||||||
LOG_VERBOSE("new task id", {{"new_id", task.id}});
|
|
||||||
}
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
condition_tasks.notify_one();
|
|
||||||
return task.id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a new task, but defer until one slot is available
|
|
||||||
void defer(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
queue_tasks_deferred.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the next id for creating anew task
|
|
||||||
int get_new_id() {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
int new_id = id++;
|
|
||||||
LOG_VERBOSE("new task id", {{"new_id", new_id}});
|
|
||||||
return new_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a new task
|
|
||||||
void on_new_task(std::function<void(task_server&)> callback) {
|
|
||||||
callback_new_task = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a multitask when it is finished
|
|
||||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
|
||||||
callback_finish_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to be called when all slots data is ready to be processed
|
|
||||||
void on_run_slots(std::function<void(void)> callback) {
|
|
||||||
callback_run_slots = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call when the state of one slot is changed
|
|
||||||
void notify_slot_changed() {
|
|
||||||
// move deferred tasks back to main loop
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto & task : queue_tasks_deferred) {
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
queue_tasks_deferred.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// end the start_loop routine
|
|
||||||
void terminate() {
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
running = false;
|
|
||||||
}
|
|
||||||
condition_tasks.notify_all();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Main loop consists of these steps:
|
|
||||||
* - Wait until a new task arrives
|
|
||||||
* - Process the task (i.e. maybe copy data into slot)
|
|
||||||
* - Check if multitask is finished
|
|
||||||
* - Run all slots
|
|
||||||
*/
|
|
||||||
void start_loop() {
|
|
||||||
running = true;
|
|
||||||
while (true) {
|
|
||||||
LOG_VERBOSE("new task may arrive", {});
|
|
||||||
{
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
lock.unlock();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
task_server task = queue_tasks.front();
|
|
||||||
queue_tasks.erase(queue_tasks.begin());
|
|
||||||
lock.unlock();
|
|
||||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
|
||||||
callback_new_task(task);
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("update_multitasks", {});
|
|
||||||
// check if we have any finished multitasks
|
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
|
||||||
while (queue_iterator != queue_multitasks.end())
|
|
||||||
{
|
|
||||||
if (queue_iterator->subtasks_remaining.empty())
|
|
||||||
{
|
|
||||||
// all subtasks done == multitask is done
|
|
||||||
task_multi current_multitask = *queue_iterator;
|
|
||||||
callback_finish_multitask(current_multitask);
|
|
||||||
// remove this multitask
|
|
||||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++queue_iterator;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// all tasks in the current loop is processed, slots data is now ready
|
|
||||||
LOG_VERBOSE("callback_run_slots", {});
|
|
||||||
callback_run_slots();
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("wait for new task", {});
|
|
||||||
// wait for new task
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
if (!running) {
|
|
||||||
LOG_VERBOSE("ending start_loop", {});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
condition_tasks.wait(lock, [&]{
|
|
||||||
return (!queue_tasks.empty() || !running);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// functions to manage multitasks
|
|
||||||
//
|
|
||||||
|
|
||||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
|
||||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
task_multi multi;
|
|
||||||
multi.id = multitask_id;
|
|
||||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
|
||||||
queue_multitasks.push_back(multi);
|
|
||||||
}
|
|
||||||
|
|
||||||
// updatethe remaining subtasks, while appending results to multitask
|
|
||||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto& multitask : queue_multitasks)
|
|
||||||
{
|
|
||||||
if (multitask.id == multitask_id)
|
|
||||||
{
|
|
||||||
multitask.subtasks_remaining.erase(subtask_id);
|
|
||||||
multitask.results.push_back(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_server_response {
|
|
||||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
|
||||||
callback_multitask_t callback_update_multitask;
|
|
||||||
// for keeping track of all tasks waiting for the result
|
|
||||||
std::set<int> waiting_task_ids;
|
|
||||||
// the main result queue
|
|
||||||
std::vector<task_result> queue_results;
|
|
||||||
std::mutex mutex_results;
|
|
||||||
std::condition_variable condition_results;
|
|
||||||
|
|
||||||
// add the task_id to the list of tasks waiting for response
|
|
||||||
void add_waiting_task_id(int task_id) {
|
|
||||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.insert(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// when the request is finished, we can remove task associated with it
|
|
||||||
void remove_waiting_task_id(int task_id) {
|
|
||||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.erase(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function blocks the thread until there is a response for this task_id
|
|
||||||
task_result recv(int task_id) {
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
condition_results.wait(lock, [&]{
|
|
||||||
return !queue_results.empty();
|
|
||||||
});
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
|
||||||
{
|
|
||||||
if (queue_results[i].id == task_id)
|
|
||||||
{
|
|
||||||
assert(queue_results[i].multitask_id == -1);
|
|
||||||
task_result res = queue_results[i];
|
|
||||||
queue_results.erase(queue_results.begin() + i);
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// should never reach here
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to update multitask
|
|
||||||
void on_multitask_update(callback_multitask_t callback) {
|
|
||||||
callback_update_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Send a new result to a waiting task_id
|
|
||||||
void send(task_result result) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
LOG_VERBOSE("send new result", {{"task_id", result.id}});
|
|
||||||
for (auto& task_id : waiting_task_ids) {
|
|
||||||
// LOG_TEE("waiting task id %i \n", task_id);
|
|
||||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
|
||||||
if (result.multitask_id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
|
|
||||||
callback_update_multitask(task_id, result.id, result);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
|
|
||||||
queue_results.push_back(result);
|
|
||||||
condition_results.notify_all();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
|
||||||
// base64 utils (TODO: move to common in the future)
|
|
||||||
//
|
|
||||||
|
|
||||||
static const std::string base64_chars =
|
|
||||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
||||||
"abcdefghijklmnopqrstuvwxyz"
|
|
||||||
"0123456789+/";
|
|
||||||
|
|
||||||
static inline bool is_base64(uint8_t c)
|
|
||||||
{
|
|
||||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
|
||||||
{
|
|
||||||
int i = 0;
|
|
||||||
int j = 0;
|
|
||||||
int in_ = 0;
|
|
||||||
|
|
||||||
int in_len = encoded_string.size();
|
|
||||||
|
|
||||||
uint8_t char_array_4[4];
|
|
||||||
uint8_t char_array_3[3];
|
|
||||||
|
|
||||||
std::vector<uint8_t> ret;
|
|
||||||
|
|
||||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
|
||||||
{
|
|
||||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
|
||||||
if (i == 4)
|
|
||||||
{
|
|
||||||
for (i = 0; i <4; i++)
|
|
||||||
{
|
|
||||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (i = 0; (i < 3); i++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[i]);
|
|
||||||
}
|
|
||||||
i = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i)
|
|
||||||
{
|
|
||||||
for (j = i; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (j = 0; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (j = 0; (j < i - 1); j++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// random string / id
|
|
||||||
//
|
|
||||||
|
|
||||||
static std::string random_string()
|
|
||||||
{
|
|
||||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
|
||||||
|
|
||||||
std::random_device rd;
|
|
||||||
std::mt19937 generator(rd());
|
|
||||||
|
|
||||||
std::string result(32, ' ');
|
|
||||||
|
|
||||||
for (int i = 0; i < 32; ++i) {
|
|
||||||
result[i] = str[generator() % str.size()];
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string gen_chatcmplid()
|
|
||||||
{
|
|
||||||
std::stringstream chatcmplid;
|
|
||||||
chatcmplid << "chatcmpl-" << random_string();
|
|
||||||
return chatcmplid.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// other common utils
|
|
||||||
//
|
|
||||||
|
|
||||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
|
||||||
{
|
|
||||||
size_t i;
|
|
||||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool ends_with(const std::string &str, const std::string &suffix)
|
|
||||||
{
|
|
||||||
return str.size() >= suffix.size() &&
|
|
||||||
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t find_partial_stop_string(const std::string &stop,
|
|
||||||
const std::string &text)
|
|
||||||
{
|
|
||||||
if (!text.empty() && !stop.empty())
|
|
||||||
{
|
|
||||||
const char text_last_char = text.back();
|
|
||||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
|
||||||
{
|
|
||||||
if (stop[char_index] == text_last_char)
|
|
||||||
{
|
|
||||||
const std::string current_partial = stop.substr(0, char_index + 1);
|
|
||||||
if (ends_with(text, current_partial))
|
|
||||||
{
|
|
||||||
return text.size() - char_index - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return std::string::npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
{
    std::string ret;
    for (; begin != end; ++begin)
    {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
}

// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        std::string res(ss.str());
        out = "byte: \\x" + res;
    }
    return out;
}

// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
{
    json out = json::array();
    for (const auto &prob : probs)
    {
        json probs_for_token = json::array();
        for (const auto &p : prob.probs)
        {
            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
            probs_for_token.push_back(json
            {
                {"tok_str", tok_str},
                {"prob", p.prob},
            });
        }
        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
        out.push_back(json{
            {"content", tok_str},
            {"probs", probs_for_token},
        });
    }
    return out;
}
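probs_vector_to_json emits one object per generated token, with the candidate probabilities nested under "probs". A small Go sketch of the same JSON shape follows; the struct names are illustrative and exist only to make the payload format concrete.

package main

import (
	"encoding/json"
	"fmt"
)

// tokenProb mirrors one entry of the "probs" array produced by
// probs_vector_to_json above.
type tokenProb struct {
	TokStr string  `json:"tok_str"`
	Prob   float64 `json:"prob"`
}

// completionTokenOutput mirrors one element of the outer array: the chosen
// token plus the candidate probabilities reported for it.
type completionTokenOutput struct {
	Content string      `json:"content"`
	Probs   []tokenProb `json:"probs"`
}

func main() {
	out := []completionTokenOutput{
		{
			Content: "Hello",
			Probs: []tokenProb{
				{TokStr: "Hello", Prob: 0.91},
				{TokStr: "Hi", Prob: 0.06},
			},
		},
	}
	b, _ := json.MarshalIndent(out, "", "  ")
	fmt.Println(string(b))
}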
@ -1,137 +0,0 @@
# common logic across linux and darwin

init_vars() {
    case "${GOARCH}" in
    "amd64")
        ARCH="x86_64"
        ;;
    "arm64")
        ARCH="arm64"
        ;;
    *)
        echo "GOARCH must be set"
        echo "this script is meant to be run from within go generate"
        exit 1
        ;;
    esac

    LLAMACPP_DIR=../llama.cpp
    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
    fi
    case $(uname -s) in
    "Darwin")
        LIB_EXT="dylib"
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
        DIST_BASE=../../dist/darwin-${GOARCH}/
        PAYLOAD_BASE=../../build/darwin/${GOARCH}
        ;;
    "Linux")
        LIB_EXT="so"
        WHOLE_ARCHIVE="-Wl,--whole-archive"
        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"

        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
        DIST_BASE=../../dist/linux-${GOARCH}/
        PAYLOAD_BASE=../../build/linux/${GOARCH}
        ;;
    *)
        ;;
    esac
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
}

git_module_setup() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
    # Make sure the tree is clean after the directory moves
    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
        echo "Cleaning up old submodule"
        rm -rf ${LLAMACPP_DIR}
    fi
    git submodule init
    git submodule update --force ${LLAMACPP_DIR}
}

apply_patches() {
    # apply temporary patches until fix is upstream
    for patch in ../patches/*.patch; do
        git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
    done
}

build() {
    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
    # remove unnecessary build artifacts
    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
}

dist() {
    [ -z "${RUNNER}" ] && exit 1
    mkdir -p ${RUNNER_BASE}/${RUNNER}/
    for f in ${BUILD_DIR}/bin/* ; do
        cp ${f} ${RUNNER_BASE}/${RUNNER}/
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
            cp ${f} ${RUNNER_BASE}/${RUNNER}/
        done
    fi
}

# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
compress() {
    [ -z "${RUNNER}" ] && exit 1
    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
    for f in ${BUILD_DIR}/bin/* ; do
        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
        compress_pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
            compress_pids+=" $!"
        done
    fi
    echo
}

wait_for_compress() {
    for pid in ${compress_pids}; do
        wait $pid
    done
    echo "Finished compression"
}

install() {
    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
        cp -af "${lib}" "${BUILD_DIR}/bin/"
    done
}

# Keep the local tree clean after we're done with the build
cleanup() {
    git submodule update --force ${LLAMACPP_DIR}
}
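compress() gzips each runner binary into ${PAYLOAD_BASE}/${RUNNER} so it can be carried as a payload and unpacked on the target machine. As a rough sketch only (the path, the destination directory and the function name are all illustrative, and the real extraction code lives elsewhere in the Go sources), unpacking one such payload could look like this:

package main

import (
	"compress/gzip"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"
)

// extractPayload decompresses one of the .gz files written by compress()
// into destDir, dropping the .gz suffix and marking the result executable.
func extractPayload(gzPath, destDir string) error {
	in, err := os.Open(gzPath)
	if err != nil {
		return err
	}
	defer in.Close()

	zr, err := gzip.NewReader(in)
	if err != nil {
		return err
	}
	defer zr.Close()

	name := strings.TrimSuffix(filepath.Base(gzPath), ".gz")
	out, err := os.OpenFile(filepath.Join(destDir, name), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o755)
	if err != nil {
		return err
	}
	defer out.Close()

	_, err = io.Copy(out, zr)
	return err
}

func main() {
	// Illustrative paths matching the layout produced above.
	if err := extractPayload("build/linux/amd64/cpu/ollama_llama_server.gz", "/tmp/runners"); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}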
@ -1,91 +0,0 @@
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ./llm/generate/

# TODO - add hardening to detect missing tools (cmake, etc.)

set -ex
set -o pipefail
compress_pids=""
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches

sign() {
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
    fi
}

COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

case "${GOARCH}" in
"amd64")
    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"

    if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
        #
        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
        RUNNER=cpu
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building LCD CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress

        #
        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
        # Approximately 400% faster than LCD on same CPU
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
        RUNNER=cpu_avx
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building AVX CPU"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress

        #
        # ~2013 CPU Dynamic library
        # Approximately 10% faster than AVX on same CPU
        #
        init_vars
        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
        RUNNER=cpu_avx2
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        echo "Building AVX2 CPU"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
    fi
    ;;
"arm64")

    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
        init_vars
        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
        RUNNER="metal"
        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
        build
        sign ${BUILD_DIR}/bin/ollama_llama_server
        compress
    fi
    ;;
*)
    echo "GOARCH must be set"
    echo "this script is meant to be run from within go generate"
    exit 1
    ;;
esac

cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
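The script above builds three x86 runner variants (cpu, cpu_avx, cpu_avx2) precisely so the strongest one supported by the host can be picked at run time. A hedged Go sketch of that selection idea using golang.org/x/sys/cpu; this is not the project's actual dispatch code, just an illustration of the design choice behind the variants.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// pickCPURunner chooses between the cpu, cpu_avx and cpu_avx2 variants
// produced above, based on the instruction sets the host CPU reports.
func pickCPURunner() string {
	switch {
	case cpu.X86.HasAVX2:
		return "cpu_avx2"
	case cpu.X86.HasAVX:
		return "cpu_avx"
	default:
		return "cpu"
	}
}

func main() {
	fmt.Println("selected runner variant:", pickCPURunner())
}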
@ -1,285 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# This script is intended to run inside the go generate
|
|
||||||
# working directory must be llm/generate/
|
|
||||||
|
|
||||||
# First we build one or more CPU based LLM libraries
|
|
||||||
#
|
|
||||||
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
|
|
||||||
# library dependencies
|
|
||||||
#
|
|
||||||
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. The ROCM
|
|
||||||
# libraries are quite large, and also dynamically load data files at runtime
|
|
||||||
# which in turn are large, so we don't attempt to cary them as payload
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
set -o pipefail
|
|
||||||
compress_pids=""
|
|
||||||
|
|
||||||
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
|
|
||||||
amdGPUs() {
|
|
||||||
if [ -n "${AMDGPU_TARGETS}" ]; then
|
|
||||||
echo "${AMDGPU_TARGETS}"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
GPU_LIST=(
|
|
||||||
"gfx900"
|
|
||||||
"gfx906:xnack-"
|
|
||||||
"gfx908:xnack-"
|
|
||||||
"gfx90a:xnack+"
|
|
||||||
"gfx90a:xnack-"
|
|
||||||
"gfx940"
|
|
||||||
"gfx941"
|
|
||||||
"gfx942"
|
|
||||||
"gfx1010"
|
|
||||||
"gfx1012"
|
|
||||||
"gfx1030"
|
|
||||||
"gfx1100"
|
|
||||||
"gfx1101"
|
|
||||||
"gfx1102"
|
|
||||||
)
|
|
||||||
(
|
|
||||||
IFS=$';'
|
|
||||||
echo "'${GPU_LIST[*]}'"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
echo "Starting linux generate script"
|
|
||||||
if [ -z "${CUDACXX}" ]; then
|
|
||||||
if [ -x /usr/local/cuda/bin/nvcc ]; then
|
|
||||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
|
||||||
else
|
|
||||||
# Try the default location in case it exists
|
|
||||||
export CUDACXX=$(command -v nvcc)
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
|
|
||||||
source $(dirname $0)/gen_common.sh
|
|
||||||
init_vars
|
|
||||||
git_module_setup
|
|
||||||
apply_patches
|
|
||||||
|
|
||||||
init_vars
|
|
||||||
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|
||||||
# Users building from source can tune the exact flags we pass to cmake for configuring
|
|
||||||
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
|
|
||||||
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
|
|
||||||
init_vars
|
|
||||||
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
|
|
||||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
|
||||||
RUNNER="cpu"
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
echo "Building custom CPU"
|
|
||||||
build
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
else
|
|
||||||
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
|
|
||||||
# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
|
||||||
# -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
|
|
||||||
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
|
||||||
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
|
||||||
# Note: the following seem to yield slower results than AVX2 - ymmv
|
|
||||||
# -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
|
|
||||||
# -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
|
|
||||||
# -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
|
|
||||||
|
|
||||||
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
|
|
||||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
|
|
||||||
#
|
|
||||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
|
||||||
#
|
|
||||||
init_vars
|
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
|
||||||
RUNNER=cpu
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
echo "Building LCD CPU"
|
|
||||||
build
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${ARCH}" == "x86_64" ]; then
|
|
||||||
#
|
|
||||||
# ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
|
|
||||||
#
|
|
||||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
|
|
||||||
#
|
|
||||||
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
|
||||||
# Approximately 400% faster than LCD on same CPU
|
|
||||||
#
|
|
||||||
init_vars
|
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
|
||||||
RUNNER=cpu_avx
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
echo "Building AVX CPU"
|
|
||||||
build
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
|
|
||||||
#
|
|
||||||
# ~2013 CPU Dynamic library
|
|
||||||
# Approximately 10% faster than AVX on same CPU
|
|
||||||
#
|
|
||||||
init_vars
|
|
||||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
|
|
||||||
RUNNER=cpu_avx2
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
echo "Building AVX2 CPU"
|
|
||||||
build
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "Skipping CPU generation step as requested"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If needed, look for the default CUDA toolkit location
|
|
||||||
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
|
|
||||||
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If needed, look for CUDA on Arch Linux
|
|
||||||
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
|
|
||||||
CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Allow override in case libcudart is in the wrong place
|
|
||||||
if [ -z "${CUDART_LIB_DIR}" ]; then
|
|
||||||
CUDART_LIB_DIR="${CUDA_LIB_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
|
|
||||||
echo "CUDA libraries detected - building dynamic CUDA library"
|
|
||||||
init_vars
|
|
||||||
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
|
|
||||||
if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
|
|
||||||
CUDA_VARIANT=_v${CUDA_MAJOR}
|
|
||||||
fi
|
|
||||||
if [ "${ARCH}" == "arm64" ]; then
|
|
||||||
echo "ARM CPU detected - disabling unsupported AVX instructions"
|
|
||||||
|
|
||||||
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
|
|
||||||
#
|
|
||||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
|
||||||
# Disabling has minimal performance effect while maintaining compatibility.
|
|
||||||
ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
|
|
||||||
fi
|
|
||||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
|
||||||
if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
|
|
||||||
echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
|
|
||||||
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
|
|
||||||
echo "Building custom CUDA GPU"
|
|
||||||
else
|
|
||||||
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
|
||||||
fi
|
|
||||||
export CUDAFLAGS="-t8"
|
|
||||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
|
|
||||||
RUNNER=cuda${CUDA_VARIANT}
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
|
||||||
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
|
|
||||||
build
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
|
|
||||||
mkdir -p "${CUDA_DIST_DIR}"
|
|
||||||
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
|
|
||||||
cp -a "${lib}" "${CUDA_DIST_DIR}"
|
|
||||||
done
|
|
||||||
compress
|
|
||||||
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${ONEAPI_ROOT}" ]; then
|
|
||||||
# Try the default location in case it exists
|
|
||||||
ONEAPI_ROOT=/opt/intel/oneapi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
|
|
||||||
echo "OneAPI libraries detected - building dynamic OneAPI library"
|
|
||||||
init_vars
|
|
||||||
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
|
|
||||||
CC=icx
|
|
||||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
|
|
||||||
RUNNER=oneapi
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
|
|
||||||
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
|
|
||||||
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
|
|
||||||
build
|
|
||||||
|
|
||||||
# copy oneAPI dependencies
|
|
||||||
mkdir -p "${ONEAPI_DIST_DIR}"
|
|
||||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
|
|
||||||
cp -a "${dep}" "${ONEAPI_DIST_DIR}"
|
|
||||||
done
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
|
|
||||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${ROCM_PATH}" ]; then
|
|
||||||
# Try the default location in case it exists
|
|
||||||
ROCM_PATH=/opt/rocm
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${CLBlast_DIR}" ]; then
|
|
||||||
# Try the default location in case it exists
|
|
||||||
if [ -d /usr/lib/cmake/CLBlast ]; then
|
|
||||||
export CLBlast_DIR=/usr/lib/cmake/CLBlast
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
|
|
||||||
echo "ROCm libraries detected - building dynamic ROCm library"
|
|
||||||
if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
|
|
||||||
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
|
|
||||||
fi
|
|
||||||
init_vars
|
|
||||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
|
||||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
|
||||||
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
|
|
||||||
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
|
|
||||||
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
|
|
||||||
echo "Building custom ROCM GPU"
|
|
||||||
fi
|
|
||||||
RUNNER=rocm${ROCM_VARIANT}
|
|
||||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
|
||||||
# ROCm dependencies are too large to fit into a unified bundle
|
|
||||||
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
|
|
||||||
# TODO figure out how to disable runpath (rpath)
|
|
||||||
# export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
|
|
||||||
export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
|
||||||
build
|
|
||||||
|
|
||||||
# copy the ROCM dependencies
|
|
||||||
mkdir -p "${ROCM_DIST_DIR}"
|
|
||||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
|
|
||||||
cp -a "${dep}"* "${ROCM_DIST_DIR}"
|
|
||||||
if [ $(readlink -f "${dep}") != "${dep}" ] ; then
|
|
||||||
cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
install
|
|
||||||
dist
|
|
||||||
compress
|
|
||||||
fi
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
wait_for_compress
|
|
||||||
echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
|
|
|
@ -1,403 +0,0 @@
|
||||||
#!powershell
|
|
||||||
|
|
||||||
$ErrorActionPreference = "Stop"
|
|
||||||
|
|
||||||
function amdGPUs {
|
|
||||||
if ($env:AMDGPU_TARGETS) {
|
|
||||||
return $env:AMDGPU_TARGETS
|
|
||||||
}
|
|
||||||
# Current supported rocblas list from ROCm v6.1.2 on windows
|
|
||||||
# https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
|
|
||||||
$GPU_LIST = @(
|
|
||||||
"gfx1030"
|
|
||||||
"gfx1100"
|
|
||||||
"gfx1101"
|
|
||||||
"gfx1102"
|
|
||||||
)
|
|
||||||
$GPU_LIST -join ';'
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function init_vars {
|
|
||||||
write-host "Checking for cmake..."
|
|
||||||
get-command cmake
|
|
||||||
write-host "Checking for ninja..."
|
|
||||||
$d=(get-command -ea 'silentlycontinue' ninja).path
|
|
||||||
if ($null -eq $d) {
|
|
||||||
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
|
|
||||||
$matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
|
|
||||||
if ($matches.count -eq 0) {
|
|
||||||
throw "Unable to locate ninja"
|
|
||||||
}
|
|
||||||
$ninjaDir=($matches[0].FullName | split-path -parent)
|
|
||||||
$env:PATH="$env:PATH;$ninjaDir"
|
|
||||||
}
|
|
||||||
if (!$script:SRC_DIR) {
|
|
||||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
|
||||||
}
|
|
||||||
if (!$script:llamacppDir) {
|
|
||||||
$script:llamacppDir = "../llama.cpp"
|
|
||||||
}
|
|
||||||
if (!$script:cmakeTargets) {
|
|
||||||
$script:cmakeTargets = @("ollama_llama_server")
|
|
||||||
}
|
|
||||||
$script:cmakeDefs = @(
|
|
||||||
"-DBUILD_SHARED_LIBS=on",
|
|
||||||
"-DGGML_NATIVE=off",
|
|
||||||
"-DGGML_OPENMP=off"
|
|
||||||
)
|
|
||||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
|
||||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
|
||||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
|
|
||||||
md "$script:DIST_BASE" -ea 0 > $null
|
|
||||||
if ($env:CGO_CFLAGS -contains "-g") {
|
|
||||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
|
|
||||||
$script:config = "RelWithDebInfo"
|
|
||||||
} else {
|
|
||||||
$script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
|
|
||||||
$script:config = "Release"
|
|
||||||
}
|
|
||||||
if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
|
|
||||||
$script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
|
|
||||||
}
|
|
||||||
# Try to find the CUDA dir
|
|
||||||
if ($env:CUDA_LIB_DIR -eq $null) {
|
|
||||||
$d=(get-command -ea 'silentlycontinue' nvcc).path
|
|
||||||
if ($d -ne $null) {
|
|
||||||
$script:CUDA_LIB_DIR=($d| split-path -parent)
|
|
||||||
$script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
|
|
||||||
}
|
|
||||||
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
|
|
||||||
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
|
|
||||||
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
|
|
||||||
} else {
|
|
||||||
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
|
|
||||||
}
|
|
||||||
# Note: Windows Kits 10 signtool crashes with GCP's plugin
|
|
||||||
if ($null -eq $env:SIGN_TOOL) {
|
|
||||||
${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
|
|
||||||
} else {
|
|
||||||
${script:SignTool}=${env:SIGN_TOOL}
|
|
||||||
}
|
|
||||||
if ("${env:KEY_CONTAINER}") {
|
|
||||||
${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function git_module_setup {
|
|
||||||
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
|
|
||||||
& git submodule init
|
|
||||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
|
||||||
& git submodule update --force "${script:llamacppDir}"
|
|
||||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
|
||||||
}
|
|
||||||
|
|
||||||
function apply_patches {
|
|
||||||
# Apply temporary patches until fix is upstream
|
|
||||||
foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
|
|
||||||
git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build {
|
|
||||||
write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
|
|
||||||
& cmake --version
|
|
||||||
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
|
|
||||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
|
||||||
if ($cmakeDefs -contains "-G") {
|
|
||||||
$extra=@("-j8")
|
|
||||||
} else {
|
|
||||||
$extra= @("--", "/maxCpuCount:8")
|
|
||||||
}
|
|
||||||
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
|
|
||||||
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
|
|
||||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
|
||||||
# Rearrange output to be consistent between different generators
|
|
||||||
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
|
|
||||||
mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
|
|
||||||
remove-item "${script:buildDir}/bin/${script:config}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function sign {
|
|
||||||
if ("${env:KEY_CONTAINER}") {
|
|
||||||
write-host "Signing ${script:buildDir}/bin/*.exe ${script:buildDir}/bin/*.dll"
|
|
||||||
foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
|
|
||||||
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
|
|
||||||
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
|
|
||||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function install {
|
|
||||||
write-host "Installing binaries to dist dir ${script:distDir}"
|
|
||||||
mkdir ${script:distDir} -ErrorAction SilentlyContinue
|
|
||||||
$binaries = dir "${script:buildDir}/bin/*.exe"
|
|
||||||
foreach ($file in $binaries) {
|
|
||||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
|
||||||
}
|
|
||||||
|
|
||||||
write-host "Installing dlls to dist dir ${script:distDir}"
|
|
||||||
$dlls = dir "${script:buildDir}/bin/*.dll"
|
|
||||||
foreach ($file in $dlls) {
|
|
||||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function cleanup {
|
|
||||||
$patches = Get-ChildItem "../patches/*.diff"
|
|
||||||
foreach ($patch in $patches) {
|
|
||||||
# Extract file paths from the patch file
|
|
||||||
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
|
|
||||||
$parts = $_ -split ' '
|
|
||||||
($parts[1] -split '/', 2)[1]
|
|
||||||
}
|
|
||||||
|
|
||||||
# Checkout each file
|
|
||||||
foreach ($file in $filePaths) {
|
|
||||||
git -C "${script:llamacppDir}" checkout $file
|
|
||||||
}
|
|
||||||
git -C "${script:llamacppDir}" checkout CMakeLists.txt
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
|
||||||
# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
|
||||||
# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
|
||||||
|
|
||||||
|
|
||||||
function build_cpu_x64 {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
|
|
||||||
init_vars
|
|
||||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
|
||||||
$script:distDir="$script:DIST_BASE\cpu"
|
|
||||||
write-host "Building LCD CPU"
|
|
||||||
build
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
} else {
|
|
||||||
write-host "Skipping CPU generation step as requested"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_cpu_arm64 {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
|
|
||||||
init_vars
|
|
||||||
write-host "Checking for clang..."
|
|
||||||
get-command clang
|
|
||||||
$env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
|
|
||||||
$env:CXXFLAGS="$env:CFLAGS"
|
|
||||||
$env:LDFLAGS="-static-libstdc++"
|
|
||||||
$script:cmakeDefs = $script:commonCpuDefs + @(
|
|
||||||
"-DCMAKE_VERBOSE_MAKEFILE=on",
|
|
||||||
"-DCMAKE_C_COMPILER=clang.exe",
|
|
||||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
|
||||||
"-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
|
|
||||||
) + $script:cmakeDefs
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
|
||||||
$script:distDir="$script:DIST_BASE\cpu"
|
|
||||||
write-host "Building LCD CPU"
|
|
||||||
build
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
} else {
|
|
||||||
write-host "Skipping CPU generation step as requested"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function build_cpu_avx() {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
|
|
||||||
init_vars
|
|
||||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
|
||||||
$script:distDir="$script:DIST_BASE\cpu_avx"
|
|
||||||
write-host "Building AVX CPU"
|
|
||||||
build
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
} else {
|
|
||||||
write-host "Skipping CPU AVX generation step as requested"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_cpu_avx2() {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
|
|
||||||
init_vars
|
|
||||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
|
||||||
$script:distDir="$script:DIST_BASE\cpu_avx2"
|
|
||||||
write-host "Building AVX2 CPU"
|
|
||||||
build
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
} else {
|
|
||||||
write-host "Skipping CPU AVX2 generation step as requested"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_cuda() {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
|
|
||||||
# Then build cuda as a dynamically loaded library
|
|
||||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
|
||||||
$script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
|
|
||||||
if ($null -ne $script:CUDA_VERSION) {
|
|
||||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
|
||||||
}
|
|
||||||
init_vars
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
|
||||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
|
||||||
$script:cmakeDefs += @(
|
|
||||||
"-A", "x64",
|
|
||||||
"-DGGML_CUDA=ON",
|
|
||||||
"-DGGML_AVX=on",
|
|
||||||
"-DGGML_AVX2=off",
|
|
||||||
"-DCMAKE_CUDA_FLAGS=-t6",
|
|
||||||
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
|
|
||||||
"-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
|
|
||||||
)
|
|
||||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
|
||||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
|
||||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
|
||||||
write-host "building custom CUDA GPU"
|
|
||||||
}
|
|
||||||
build
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
|
|
||||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
|
|
||||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
} else {
|
|
||||||
write-host "Skipping CUDA generation step"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_oneapi() {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
|
|
||||||
# Get oneAPI version
|
|
||||||
$script:ONEAPI_VERSION = icpx --version
|
|
||||||
$script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
|
|
||||||
if ($null -ne $script:ONEAPI_VERSION) {
|
|
||||||
$script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
|
|
||||||
}
|
|
||||||
init_vars
|
|
||||||
$script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
|
|
||||||
$script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
|
|
||||||
$script:cmakeDefs += @(
|
|
||||||
"-G", "MinGW Makefiles",
|
|
||||||
"-DGGML_SYCL=ON",
|
|
||||||
"-DCMAKE_C_COMPILER=icx",
|
|
||||||
"-DCMAKE_CXX_COMPILER=icx",
|
|
||||||
"-DCMAKE_BUILD_TYPE=Release"
|
|
||||||
)
|
|
||||||
|
|
||||||
Write-Host "Building oneAPI"
|
|
||||||
build
|
|
||||||
# Ninja doesn't prefix with config name
|
|
||||||
if ($null -ne $script:DUMPBIN) {
|
|
||||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
|
|
||||||
}
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
|
|
||||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
} else {
|
|
||||||
Write-Host "Skipping oneAPI generation step"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_rocm() {
|
|
||||||
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
|
|
||||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
|
||||||
if ($null -ne $script:ROCM_VERSION) {
|
|
||||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
|
||||||
}
|
|
||||||
|
|
||||||
init_vars
|
|
||||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
|
||||||
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
|
|
||||||
$script:cmakeDefs += @(
|
|
||||||
"-G", "Ninja",
|
|
||||||
"-DCMAKE_C_COMPILER=clang.exe",
|
|
||||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
|
||||||
"-DGGML_HIPBLAS=on",
|
|
||||||
"-DHIP_PLATFORM=amd",
|
|
||||||
"-DGGML_AVX=on",
|
|
||||||
"-DGGML_AVX2=off",
|
|
||||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
|
||||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
|
||||||
"-DGPU_TARGETS=$(amdGPUs)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Make sure the ROCm binary dir is first in the path
|
|
||||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
|
||||||
|
|
||||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
|
||||||
$env:LIB=""
|
|
||||||
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
|
|
||||||
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
|
|
||||||
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
|
|
||||||
write-host "building custom ROCM GPU"
|
|
||||||
}
|
|
||||||
write-host "Building ROCm"
|
|
||||||
build
|
|
||||||
# Ninja doesn't prefix with config name
|
|
||||||
${script:config}=""
|
|
||||||
if ($null -ne $script:DUMPBIN) {
|
|
||||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
|
||||||
}
|
|
||||||
sign
|
|
||||||
install
|
|
||||||
|
|
||||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
|
|
||||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
|
||||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
|
||||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
|
|
||||||
} else {
|
|
||||||
write-host "Skipping ROCm generation step"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
init_vars
|
|
||||||
if ($($args.count) -eq 0) {
|
|
||||||
git_module_setup
|
|
||||||
apply_patches
|
|
||||||
if ($script:ARCH -eq "arm64") {
|
|
||||||
build_cpu_arm64
|
|
||||||
} else { # amd64
|
|
||||||
build_cpu_x64
|
|
||||||
build_cpu_avx
|
|
||||||
build_cpu_avx2
|
|
||||||
build_cuda
|
|
||||||
build_oneapi
|
|
||||||
build_rocm
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
|
|
||||||
} else {
|
|
||||||
for ( $i = 0; $i -lt $args.count; $i++ ) {
|
|
||||||
write-host "performing $($args[$i])"
|
|
||||||
& $($args[$i])
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,3 +0,0 @@
package generate

//go:generate bash ./gen_darwin.sh
@ -1,3 +0,0 @@
package generate

//go:generate bash ./gen_linux.sh
@ -1,3 +0,0 @@
package generate

//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
41 llm/ggml.go
@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@ -368,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
 
 	layers := llm.Tensors().Layers()
 
+	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
@ -400,6 +403,42 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 		)
 	}
+	case "mllama":
+		var visionTokens, tiles uint64 = 1601, 4
+
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2* // sizeof(float16)
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					context +
+					4* // sizeof(float32)
+						uint64(crossAttentionLayers.size)* // num cross attention layers
+						visionTokens*
+						tiles)
+		}
+
+		fullOffload = max(
+			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
+			// vocab graph
+			4*batch*(embedding+vocab),
+		)
+
+		var ropeFreqsCount uint64
+		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
+				ropeFreqsCount = ropeFreqsWeights.parameters()
+			}
+		}
+
+		partialOffload = max(
+			4*(batch*
+				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
+				ropeFreqsCount+
+				embeddingHeadsK*context*headsKV),
+			// vocab graph
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
 	case "gemma", "gemma2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
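The new mllama branch sizes the KV cache in two parts: self-attention layers store fp16 K/V per context token, while cross-attention layers store fp32 K/V for a fixed number of vision tokens per tile. A standalone Go sketch of that arithmetic follows; the hyperparameter values passed in main are placeholders for illustration, not taken from a specific model file.

package main

import "fmt"

// kvBytesMllama reproduces the arithmetic added in GraphSize for the
// "mllama" case: 2 bytes (fp16) per K/V element in the non-cross layers,
// 4 bytes (fp32) per element in the cross-attention layers.
func kvBytesMllama(context, blockCount, crossLayers, headsKV, headDimK, headDimV uint64) uint64 {
	const (
		visionTokens = 1601
		tiles        = 4
	)
	return headsKV * (headDimK + headDimV) *
		(2*(blockCount-crossLayers)*context +
			4*crossLayers*visionTokens*tiles)
}

func main() {
	// Placeholder hyperparameters: 8k context, 40 layers of which 8 are
	// cross-attention, 8 KV heads, 128-dim K and V heads.
	bytes := kvBytesMllama(8192, 40, 8, 8, 128, 128)
	fmt.Printf("estimated KV cache: %.2f GiB\n", float64(bytes)/(1<<30))
}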
@ -1 +0,0 @@
Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555
@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
-
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
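With GraphSize now returning kv, the memory estimate relies on the general formula sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv that previously lived inline here. A worked example in Go; the hyperparameters are typical of a Llama-3-8B-class model (32 layers, 8 KV heads, 128-dim heads) and are used only for illustration.

package main

import "fmt"

// kvBytes mirrors the default KV-cache size formula now computed inside
// GraphSize: 2 bytes per fp16 element, times context length, layer count,
// per-head K and V dimensions, and number of KV heads.
func kvBytes(context, blockCount, headDimK, headDimV, headsKV uint64) uint64 {
	return 2 * context * blockCount * (headDimK + headDimV) * headsKV
}

func main() {
	// 8192-token context, 32 layers, 128-dim K and V heads, 8 KV heads:
	// 2 * 8192 * 32 * 256 * 8 bytes = exactly 1 GiB of fp16 KV cache.
	bytes := kvBytes(8192, 32, 128, 128, 8)
	fmt.Printf("fp16 KV cache at 8k context: %.2f GiB\n", float64(bytes)/(1<<30))
}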
@ -1,22 +0,0 @@
|
||||||
From 7a3555098d4591c9b329c677654497ed8cee07ec Mon Sep 17 00:00:00 2001
|
|
||||||
From: Michael Yang <mxyng@pm.me>
|
|
||||||
Date: Fri, 23 Aug 2024 11:27:48 -0700
|
|
||||||
Subject: [PATCH] patch cmakelist
|
|
||||||
|
|
||||||
---
|
|
||||||
CMakeLists.txt | 2 ++
|
|
||||||
1 file changed, 2 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
||||||
index 415743c2..aaadd13e 100644
|
|
||||||
--- a/CMakeLists.txt
|
|
||||||
+++ b/CMakeLists.txt
|
|
||||||
@@ -210,3 +210,5 @@ if (LLAMA_BUILD_EXAMPLES)
|
|
||||||
add_subdirectory(examples)
|
|
||||||
add_subdirectory(pocs)
|
|
||||||
endif()
|
|
||||||
+
|
|
||||||
+add_subdirectory(../ext_server ext_server) # ollama
|
|
||||||
--
|
|
||||||
2.39.3 (Apple Git-146)
|
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
From c97ed60c3369294d5551ba099a88ddc509687df1 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Gabe Goodhart <ghart@us.ibm.com>
|
|
||||||
Date: Thu, 19 Sep 2024 16:55:15 -0600
|
|
||||||
Subject: [PATCH] patch load progress
|
|
||||||
|
|
||||||
---
|
|
||||||
common/common.cpp | 2 ++
|
|
||||||
common/common.h | 7 +++++++
|
|
||||||
2 files changed, 9 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/common/common.cpp b/common/common.cpp
|
|
||||||
index 8d0ed4f9..a09e8a53 100644
|
|
||||||
--- a/common/common.cpp
|
|
||||||
+++ b/common/common.cpp
|
|
||||||
@@ -955,6 +955,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
|
||||||
mparams.use_mmap = params.use_mmap;
|
|
||||||
mparams.use_mlock = params.use_mlock;
|
|
||||||
mparams.check_tensors = params.check_tensors;
|
|
||||||
+ mparams.progress_callback = params.progress_callback;
|
|
||||||
+ mparams.progress_callback_user_data = params.progress_callback_user_data;
|
|
||||||
if (params.kv_overrides.empty()) {
|
|
||||||
mparams.kv_overrides = NULL;
|
|
||||||
} else {
|
|
||||||
diff --git a/common/common.h b/common/common.h
|
|
||||||
index cb87c447..818a4a4a 100644
|
|
||||||
--- a/common/common.h
|
|
||||||
+++ b/common/common.h
|
|
||||||
@@ -266,6 +266,13 @@ struct gpt_params {
|
|
||||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
|
||||||
std::vector<std::string> image; // path to image file(s)
|
|
||||||
|
|
||||||
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
|
||||||
+ // If the provided progress_callback returns true, model loading continues.
|
|
||||||
+ // If it returns false, model loading is immediately aborted.
|
|
||||||
+ llama_progress_callback progress_callback = NULL;
|
|
||||||
+ // context pointer passed to the progress callback
|
|
||||||
+ void * progress_callback_user_data;
|
|
||||||
+
|
|
||||||
// embedding
|
|
||||||
bool embedding = false; // get only sentence embedding
|
|
||||||
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
||||||
--
|
|
||||||
2.39.3 (Apple Git-146)
|
|
||||||
|
|
|
@ -1,24 +0,0 @@
|
||||||
From 6fdf4268e13e56f0050fa6a29b029cbd54be49d2 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Gabe Goodhart <ghart@us.ibm.com>
|
|
||||||
Date: Thu, 19 Sep 2024 16:58:03 -0600
|
|
||||||
Subject: [PATCH] clip log
|
|
||||||
|
|
||||||
---
|
|
||||||
examples/llava/clip.cpp | 1 +
|
|
||||||
1 file changed, 1 insertion(+)
|
|
||||||
|
|
||||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
|
||||||
index 8aa7b075..b8941c74 100644
|
|
||||||
--- a/examples/llava/clip.cpp
|
|
||||||
+++ b/examples/llava/clip.cpp
|
|
||||||
@@ -3,6 +3,7 @@
|
|
||||||
// I'll gradually clean and extend it
|
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
|
||||||
#include "clip.h"
|
|
||||||
+#include "common.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-alloc.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
--
|
|
||||||
2.39.3 (Apple Git-146)
|
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
From 4f2b9cd0f012c49f40d0784454864ad41ca418b2 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Gabe Goodhart <ghart@us.ibm.com>
|
|
||||||
Date: Thu, 19 Sep 2024 17:00:28 -0600
|
|
||||||
Subject: [PATCH] load exception
|
|
||||||
|
|
||||||
---
|
|
||||||
src/llama.cpp | 25 ++++++++++++++++---------
|
|
||||||
1 file changed, 16 insertions(+), 9 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
||||||
index af8afd84..4d1db3d5 100644
|
|
||||||
--- a/src/llama.cpp
|
|
||||||
+++ b/src/llama.cpp
|
|
||||||
@@ -8871,7 +8871,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
||||||
}
|
|
||||||
} catch (const std::exception & err) {
|
|
||||||
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
|
||||||
- return -1;
|
|
||||||
+ throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loading time will be recalculate after the first eval, so
|
|
||||||
@@ -18675,16 +18675,23 @@ struct llama_model * llama_load_model_from_file(
|
|
||||||
}
|
|
||||||
model->rpc_servers.push_back(servers);
|
|
||||||
}
|
|
||||||
- int status = llama_model_load(path_model, *model, params);
|
|
||||||
- GGML_ASSERT(status <= 0);
|
|
||||||
- if (status < 0) {
|
|
||||||
- if (status == -1) {
|
|
||||||
- LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
|
||||||
- } else if (status == -2) {
|
|
||||||
- LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
|
||||||
+
|
|
||||||
+ try {
|
|
||||||
+ int status = llama_model_load(path_model, *model, params);
|
|
||||||
+ GGML_ASSERT(status <= 0);
|
|
||||||
+ if (status < 0) {
|
|
||||||
+ if (status == -1) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
|
||||||
+ } else if (status == -2) {
|
|
||||||
+ LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
|
||||||
+ }
|
|
||||||
+ delete model;
|
|
||||||
+ return nullptr;
|
|
||||||
}
|
|
||||||
+ } catch (...) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
|
|
||||||
delete model;
|
|
||||||
- return nullptr;
|
|
||||||
+ throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
return model;
|
|
||||||
--
|
|
||||||
2.39.3 (Apple Git-146)
|
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
From 91d3f886f1645b38d9658c0e125603e8d5338146 Mon Sep 17 00:00:00 2001
|
|
||||||
From: nobody <>
|
|
||||||
Date: Tue, 1 Oct 2024 13:55:01 -0600
|
|
||||||
Subject: [PATCH] metal
|
|
||||||
|
|
||||||
---
|
|
||||||
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
|
|
||||||
1 file changed, 13 insertions(+), 17 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
|
|
||||||
index 9da08fe2..3a433703 100644
|
|
||||||
--- a/ggml/src/ggml-metal.m
|
|
||||||
+++ b/ggml/src/ggml-metal.m
|
|
||||||
@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
|
|
||||||
// to the matrix-vector kernel
|
|
||||||
int ne11_mm_min = 1;
|
|
||||||
|
|
||||||
-#if 0
|
|
||||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
|
||||||
// these numbers do not translate to other devices or model sizes
|
|
||||||
// TODO: need to find a better approach
|
|
||||||
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
|
||||||
- switch (src0t) {
|
|
||||||
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
|
||||||
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
|
||||||
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
|
||||||
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
|
||||||
- case GGML_TYPE_Q4_0:
|
|
||||||
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
|
||||||
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
|
||||||
- case GGML_TYPE_Q5_0: // not tested yet
|
|
||||||
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
|
||||||
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
|
||||||
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
|
||||||
- default: ne11_mm_min = 1; break;
|
|
||||||
- }
|
|
||||||
+ switch (src0t) {
|
|
||||||
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
|
||||||
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
|
||||||
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
|
||||||
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
|
||||||
+ case GGML_TYPE_Q4_0:
|
|
||||||
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
|
||||||
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
|
||||||
+ case GGML_TYPE_Q5_0: // not tested yet
|
|
||||||
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
|
||||||
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
|
||||||
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
|
||||||
+ default: ne11_mm_min = 1; break;
|
|
||||||
}
|
|
||||||
-#endif
|
|
||||||
|
|
||||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
|
||||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
|
||||||
--
|
|
||||||
2.39.3 (Apple Git-146)
|
|
||||||
|
|
|
@@ -1,44 +0,0 @@
From 0e531d69786c4a96a3a2bcf7b2d576bd6f7edf25 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] 05-default-pretokenizer.diff

---
src/llama.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4c0a1bb6..800dfb95 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--
2.39.3 (Apple Git-146)

@@ -1,54 +0,0 @@
From 235b6d876a74cb09abe26985fa89ebe5bfc9f562 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Thu, 19 Sep 2024 17:06:17 -0600
Subject: [PATCH] embeddings

---
src/llama.cpp | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1a8e0c51..e55ec3f8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16516,7 +16516,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -16794,20 +16794,23 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ embd = ggml_graph_node(gf, i);
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
- embd = ggml_graph_node(gf, i);
break;
}
}
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

ggml_backend_sched_alloc_graph(lctx.sched, gf);
--
2.39.3 (Apple Git-146)

@@ -1,54 +0,0 @@
From 01c42149cbdc194644a2f138598029938e0dd447 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Thu, 19 Sep 2024 17:09:57 -0600
Subject: [PATCH] clip unicode

---
examples/llava/clip.cpp | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b8941c74..3a735f17 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -40,6 +40,14 @@
#include <cinttypes>
#include <limits>

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+ #define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -1227,7 +1235,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return nullptr;
}

+#ifdef _WIN32
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+ if (!wlen) {
+ return NULL;
+ }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+ if (!wlen) {
+ free(wbuf);
+ return NULL;
+ }
+ auto fin = std::ifstream(wbuf, std::ios::binary);
+ free(wbuf);
+#else
auto fin = std::ifstream(fname, std::ios::binary);
+#endif
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
--
2.39.3 (Apple Git-146)

@@ -1,412 +0,0 @@
From a8fe40fa7b026d2db9bb6aeecd24fcd2027110ec Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:16 -0700
Subject: [PATCH] add solar-pro support

solar-pro introduces block skip connections where blocks are connected
to other, non-sequential blocks with a scale multiple

this change adds 4 new keys to store the skip connections and one new
tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 255 insertions(+), 15 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4c0a1bb6..c6fc0c3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR,
LLM_ARCH_UNKNOWN,
};

@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -327,6 +329,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
};

static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1527,6 +1532,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+
+ {
+ LLM_ARCH_SOLAR,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
{
@@ -2360,6 +2384,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_34B,
MODEL_35B,
@@ -2409,6 +2434,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2479,6 +2506,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;

if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2588,6 +2616,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
+
+ bool n_bskcn(uint32_t n, uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+ }
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2769,6 +2805,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
+
+ struct ggml_tensor * bskcn_tv;
};

// very similar to llama_batch,
@@ -6134,6 +6172,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr.at(i);
+ bskcn.fill(0);
+ ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ }
default: (void)0;
}

@@ -8839,6 +8892,37 @@ static bool llm_load_tensors(

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -16009,7 +16093,6 @@ struct llm_build_context {

return gf;
}
-
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
@@ -16187,6 +16270,158 @@ struct llm_build_context {

return gf;
}
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ struct ggml_tensor * bskcn_1;
+ struct ggml_tensor * bskcn_2;
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ if (hparams.n_bskcn(0, il)) {
+ bskcn_1 = inpSA;
+ }
+
+ if (hparams.n_bskcn(1, il)) {
+ bskcn_2 = inpSA;
+ }
+
+ if (hparams.n_bskcn(2, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ if (hparams.n_bskcn(3, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16451,6 +16686,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ result = llm.build_solar();
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -19594,6 +19833,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2
--
2.39.3 (Apple Git-146)

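The commit message in the solar-pro patch above describes the block-skip connection as a weighted sum of a saved earlier hidden state and the current one, with weights taken from the two-element bskcn_tv tensor (roughly bskcn_tv and 1 - bskcn_tv). A minimal illustrative sketch of that scaling in Go, assuming plain float32 slices rather than ggml tensors; the function name blendSkip is hypothetical and not part of the patch or the commit:

```go
package main

import "fmt"

// blendSkip mixes a saved skip-connection state into the current hidden state
// using a two-element weight vector tv, mirroring the (bskcn_tv, 1 - bskcn_tv)
// scaling described in the solar-pro patch message.
func blendSkip(skip, cur []float32, tv [2]float32) []float32 {
	out := make([]float32, len(cur))
	for i := range cur {
		out[i] = tv[0]*skip[i] + tv[1]*cur[i]
	}
	return out
}

func main() {
	skip := []float32{1, 2, 3, 4}
	cur := []float32{4, 3, 2, 1}
	// bskcn_tv = 0.25 gives the weight pair (0.25, 0.75)
	fmt.Println(blendSkip(skip, cur, [2]float32{0.25, 0.75}))
}
```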
@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
"--model", model,
"--ctx-size", strconv.Itoa(opts.NumCtx),
"--batch-size", strconv.Itoa(opts.NumBatch),
- "--embedding",
}

if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
params = append(params, "--threads", strconv.Itoa(defaultThreads))
}

- if !opts.F16KV {
- params = append(params, "--memory-f32")
- }

flashAttnEnabled := envconfig.FlashAttention()

for _, g := range gpus {

@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
"num_gpu 1": {"num_gpu", "1"},
"main_gpu 1": {"main_gpu", "1"},
"low_vram true": {"low_vram", "true"},
- "f16_kv true": {"f16_kv", "true"},
"logits_all true": {"logits_all", "true"},
"vocab_only true": {"vocab_only", "true"},
"use_mmap true": {"use_mmap", "true"},

@@ -6,23 +6,18 @@ set -e

mkdir -p dist

+ # These require Xcode v13 or older to target MacOS v11
+ # If installed to an alternate location use the following to enable
+ # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+ # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
+ export CGO_CFLAGS=-mmacosx-version-min=11.3
+ export CGO_CXXFLAGS=-mmacosx-version-min=11.3
+ export CGO_LDFLAGS=-mmacosx-version-min=11.3

for TARGETARCH in arm64 amd64; do
- if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
echo "Building Go runner darwin $TARGETARCH"
rm -rf llama/build
GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
- else
- echo "Building C++ runner darwin $TARGETARCH"
- rm -rf llm/build
- GOOS=darwin GOARCH=$TARGETARCH go generate ./...
- fi
- # These require Xcode v13 or older to target MacOS v11
- # If installed to an alternate location use the following to enable
- # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
- # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
- export CGO_CFLAGS=-mmacosx-version-min=11.3
- export CGO_CXXFLAGS=-mmacosx-version-min=11.3
- export CGO_LDFLAGS=-mmacosx-version-min=11.3
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
done

@@ -19,7 +19,7 @@ docker buildx build \
${LOAD_OR_PUSH} \
--platform=${PLATFORM} \
${OLLAMA_COMMON_BUILD_ARGS} \
- -f ${DOCKERFILE_DIR}Dockerfile \
+ -f Dockerfile \
-t ${FINAL_IMAGE_REPO}:$VERSION \
.

@@ -29,7 +29,7 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
--platform=linux/amd64 \
${OLLAMA_COMMON_BUILD_ARGS} \
--target runtime-rocm \
- -f ${DOCKERFILE_DIR}Dockerfile \
+ -f Dockerfile \
-t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
.
fi

@@ -19,7 +19,7 @@ docker buildx build \
--platform=${PLATFORM} \
${OLLAMA_COMMON_BUILD_ARGS} \
--target dist \
- -f ${DOCKERFILE_DIR}Dockerfile \
+ -f Dockerfile \
.

# buildx behavior changes for single vs. multiplatform

@@ -1,76 +0,0 @@
#!/usr/bin/env python3
import subprocess
import sys
from urllib.parse import urlparse
from git import Repo

# Helper script to be able to build on remote repos using git to push local changes
# (e.g. particularly helpful to target a remote windows build system)
#
# Typical windows remote git config looks like this:
#
#[remote "windows-pa"]
# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
# fetch = +refs/heads/*:refs/remotes/windows-pa/*
# uploadpack = powershell git upload-pack
# receivepack = powershell git receive-pack
#

# TODO - add argpare and make this more configurable
# - force flag becomes optional
# - generate, build or test ...

# Note: remote repo will need this run once:
# git config --local receive.denyCurrentBranch updateInstead
repo = Repo(".")

# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
# GoCmd = "/usr/local/go/bin/go"
GoCmd = "go"

if repo.is_dirty():
    print("Tree is dirty. Commit your changes before running this script")
    sys.exit(1)

if len(sys.argv) != 2:
    print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
    sys.exit(1)
remote_name = sys.argv[1]

remote = {r.name: r for r in repo.remotes}[remote_name]
raw_url = list(remote.urls)[0]
url = urlparse(raw_url)
# Windows urls don't quite parse properly
if url.scheme == "" and url.netloc == "":
    url = urlparse("ssh://" + raw_url)
print("URL: " + str(url))
netloc = url.netloc.split(":")[0]
path = url.path
branch_name = repo.active_branch.name

print("Force pushing content to remote...")
# Use with care given the force push
remote.push(force=True).raise_if_error()

print("Ensuring correct branch checked out on remote via ssh...")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])


# TODO - add some hardening to try to figure out how to set up the path properly
# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
# TODO - or consider paramiko maybe

print("Running Windows Build Script")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', "powershell", "-ExecutionPolicy", "Bypass", "-File", "./scripts/build_windows.ps1"])

# print("Building")
# subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])

print("Copying built result")
subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe", './dist/'])

print("Copying installer")
subprocess.check_call(['scp', netloc +":"+ path + "/dist/Ollama Setup.exe", './dist/'])

@@ -75,7 +75,6 @@ function checkEnv() {
} else {
write-host "Code signing disabled - please set KEY_CONTAINERS to sign and copy ollama_inc.crt to the top of the source tree"
}

}

@@ -83,50 +82,7 @@ function buildOllama() {
if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
write-host "Building ollama runners"
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
- if ($null -eq ${env:OLLAMA_NEW_RUNNERS}) {
- # Start by skipping CUDA to build everything else
- write-host "Building ollama runners"
- powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
- if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}

- # Then skip everyhting else and build all the CUDA variants
- foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
- write-host "Building CUDA ${env:CUDA_LIB_DIR} runner"

- if ($env:CUDA_LIB_DIR.Contains("v12")) {
- powershell -Command {
- $env:OLLAMA_SKIP_CUDA_GENERATE=""
- $env:OLLAMA_SKIP_STATIC_GENERATE="1"
- $env:OLLAMA_SKIP_CPU_GENERATE="1"
- $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
- $env:OLLAMA_SKIP_ROCM_GENERATE="1"
- $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
- $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
- $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
- $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
- & go generate ./...
- }
- } else {
- powershell -Command {
- $env:OLLAMA_SKIP_CUDA_GENERATE=""
- $env:OLLAMA_SKIP_STATIC_GENERATE="1"
- $env:OLLAMA_SKIP_CPU_GENERATE="1"
- $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
- $env:OLLAMA_SKIP_ROCM_GENERATE="1"
- $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
- $env:OLLAMA_CUSTOM_CUDA_DEFS=""
- $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
- $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
- & go generate ./...
- }
- }
- if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- }
- } else {
& make -C llama -j 12
- if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- }

if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} else {
write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@@ -134,11 +90,6 @@ function buildOllama() {
write-host "Building ollama CLI"
& go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- if ("${env:KEY_CONTAINER}") {
- & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
- /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
- if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- }
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
}
@@ -149,11 +100,6 @@ function buildApp() {
& windres -l 0 -o ollama.syso ollama.rc
& go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" -o "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" .
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- if ("${env:KEY_CONTAINER}") {
- & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
- /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe"
- if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- }
}

function gatherDependencies() {
@@ -172,7 +118,7 @@ function gatherDependencies() {
} else {
$depArch=$script:TARGET_ARCH
}
- if ($depArch -eq "amd64") {
+ if ($depArch -eq "x64") {
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
@@ -186,16 +132,19 @@ function gatherDependencies() {
copy-item -path "${env:VCToolsRedistDir}\vc_redist.arm64.exe" -destination "${script:DIST_DIR}" -verbose
}


cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
+ }

+ function sign() {
if ("${env:KEY_CONTAINER}") {
- write-host "about to sign"
+ write-host "Signing Ollama executables, scripts and libraries"
- foreach ($file in (get-childitem "${script:DIST_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
- write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
- /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
+ /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} `
+ $(get-childitem -path "${script:SRC_DIR}\dist" -r -include @('ollama_welcome.ps1')) `
+ $(get-childitem -path "${script:SRC_DIR}\dist\windows-*" -r -include @('*.exe', '*.dll'))
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
- }
+ } else {
+ write-host "Signing not enabled"
}
}

@@ -226,6 +175,7 @@ try {
buildOllama
buildApp
gatherDependencies
+ sign
buildInstaller
distZip
} else {

@@ -20,12 +20,6 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
--build-arg=CUSTOM_CPU_FLAGS \
--build-arg=GPU_RUNNER_CPU_FLAGS \
--build-arg=AMDGPU_TARGETS"
- OLLAMA_NEW_RUNNERS=${OLLAMA_NEW_RUNNERS:-""}
- if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
- DOCKERFILE_DIR="./llama/"
- else
- DOCKERFILE_DIR="./"
- fi

echo "Building Ollama"
echo "VERSION=$VERSION"

@@ -120,6 +120,78 @@ func TestGetOptimalTiledCanvas(t *testing.T) {
TileSize: 560,
Expected: image.Point{1120, 1120},
},
+ {
+ ImageSize: image.Point{800, 600},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{1120, 1120},
+ },
+ {
+ ImageSize: image.Point{640, 480},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{1120, 560},
+ },
+ {
+ ImageSize: image.Point{320, 200},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 560},
+ },
+ {
+ ImageSize: image.Point{1320, 200},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{1680, 560},
+ },
+ {
+ ImageSize: image.Point{2000, 200},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{2240, 560},
+ },
+ {
+ ImageSize: image.Point{10000, 200},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{2240, 560},
+ },
+ {
+ ImageSize: image.Point{480, 640},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 1120},
+ },
+ {
+ ImageSize: image.Point{200, 320},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 560},
+ },
+ {
+ ImageSize: image.Point{200, 1320},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 1680},
+ },
+ {
+ ImageSize: image.Point{200, 2000},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 2240},
+ },
+ {
+ ImageSize: image.Point{200, 10000},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{560, 2240},
+ },
+ {
+ ImageSize: image.Point{10000, 10000},
+ MaxImageTiles: 4,
+ TileSize: 560,
+ Expected: image.Point{1120, 1120},
+ },
}

for _, c := range cases {

@@ -690,7 +690,8 @@ func CopyModel(src, dst model.Name) error {
}

func deleteUnusedLayers(deleteMap map[string]struct{}) error {
- manifests, err := Manifests()
+ // Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+ manifests, err := Manifests(true)
if err != nil {
return err
}
@@ -853,8 +854,8 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
manifest, _, err := GetManifest(mp)
if errors.Is(err, os.ErrNotExist) {
// noop
- } else if err != nil && !errors.Is(err, os.ErrNotExist) {
+ } else if err != nil {
- return err
+ slog.Warn("pulling model with bad existing manifest", "name", name, "error", err)
} else {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = struct{}{}

@@ -106,7 +106,8 @@ func (l *Layer) Remove() error {
return nil
}

- ms, err := Manifests()
+ // Ignore corrupt manifests to avoid blocking deletion of layers that are freshly orphaned
+ ms, err := Manifests(true)
if err != nil {
return err
}

@@ -123,7 +123,7 @@ func WriteManifest(name model.Name, config Layer, layers []Layer) error {
return json.NewEncoder(f).Encode(m)
}

- func Manifests() (map[model.Name]*Manifest, error) {
+ func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
manifests, err := GetManifestPath()
if err != nil {
return nil, err
@@ -145,22 +145,29 @@ func Manifests() (map[model.Name]*Manifest, error) {
if !fi.IsDir() {
rel, err := filepath.Rel(manifests, match)
if err != nil {
+ if !continueOnError {
+ return nil, fmt.Errorf("%s %w", match, err)
+ }
slog.Warn("bad filepath", "path", match, "error", err)
continue
}

n := model.ParseNameFromFilepath(rel)
if !n.IsValid() {
+ if !continueOnError {
+ return nil, fmt.Errorf("%s %w", rel, err)
+ }
slog.Warn("bad manifest name", "path", rel)
continue
}

m, err := ParseNamedManifest(n)
- if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
+ if err != nil {
+ if !continueOnError {
+ return nil, fmt.Errorf("%s %w", n, err)
+ }
slog.Warn("bad manifest", "name", n, "error", err)
continue
- } else if err != nil {
- return nil, fmt.Errorf("%s: %w", n, err)
}

ms[n] = m

@@ -112,7 +112,7 @@ func TestManifests(t *testing.T) {
createManifest(t, d, p)
}

- ms, err := Manifests()
+ ms, err := Manifests(true)
if err != nil {
t.Fatal(err)
}

@@ -27,6 +27,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

isMllama := checkMllamaModelFamily(m)

+ var imageNumTokens int
+ // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+ if isMllama {
+ // Our mllama implementation packs all of the embeddings into a single token
+ imageNumTokens = 1
+ } else {
+ // Clip images are represented as 768 tokens, each an embedding
+ imageNumTokens = 768
+ }

n := len(msgs) - 1
// in reverse, find all messages that fit into context window
for i := n; i >= 0; i-- {
@@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
ctxLen := len(s)
if m.ProjectorPaths != nil {
for _, m := range msgs[i:] {
- // images are represented as 768 sized embeddings
+ ctxLen += imageNumTokens * len(m.Images)
- // TODO: get embedding length from project metadata
- ctxLen += 768 * len(m.Images)
}
}

@@ -75,11 +83,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

currMsgIdx := n

+ for cnt, msg := range msgs[currMsgIdx:] {
+ prefix := ""
+ imgPrompt := ""
+ prompt := msg.Content
+
+ for _, i := range msg.Images {
+ var imgData llm.ImageData
+
if isMllama {
- lastMsgIdx := len(msgs) - 1
+ data, aspectRatioID, err := imageproc.Preprocess(i)
- for i := lastMsgIdx; i >= currMsgIdx; i-- {
- if len(msgs[i].Images) > 0 {
- data, aspectRatioID, err := imageproc.Preprocess(msgs[i].Images[0])
if err != nil {
return "", nil, err
}
@@ -90,25 +103,19 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
return "", nil, err
}

- imgData := llm.ImageData{
+ imgData = llm.ImageData{
+ ID: len(images),
Data: buf.Bytes(),
AspectRatioID: aspectRatioID,
}
+ imgPrompt = "<|image|>"
- msgs[i].Content = strings.TrimSpace("<|image|>" + msgs[i].Content)
- images = append(images, imgData)
- break
- }
- }
} else {
- for cnt, msg := range msgs[currMsgIdx:] {
+ imgData = llm.ImageData{
- prefix := ""
- prompt := msg.Content
- for _, i := range msg.Images {
- imgData := llm.ImageData{
ID: len(images),
Data: i,
}
+ imgPrompt = " "
+ }

imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
if !strings.Contains(prompt, "[img]") {
@@ -119,8 +126,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

images = append(images, imgData)
}
- msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + " " + prompt)
+ msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + imgPrompt + prompt)
- }
}

// truncate any messages that do not fit into the context window

@@ -249,7 +249,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
},
expect: expect{
- prompt: "<|image|>How many hotdogs are in this image? ",
+ prompt: "[img-0]<|image|>How many hotdogs are in this image? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
@@ -264,7 +264,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
},
expect: expect{
- prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+ prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
@@ -279,8 +279,8 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
},
expect: expect{
- prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+ prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
- images: [][]byte{imgBuf2},
+ images: [][]byte{imgBuf, imgBuf2},
aspectRatioID: 1,
},
},
@@ -294,7 +294,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "Which ones have mustard?"},
},
expect: expect{
- prompt: "<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+ prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},

@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}

- images[i] = llm.ImageData{Data: buf.Bytes(), AspectRatioID: aspectRatioID}
+ images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: aspectRatioID}
} else {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
@@ -239,11 +239,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}

for _, i := range images {
+ imgPrompt := ""
if isMllama {
- msgs = append(msgs, api.Message{Role: "user", Content: "<|image|>"})
+ imgPrompt = "<|image|>"
- } else {
- msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
}
+ msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
}

values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
@@ -267,7 +267,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt = b.String()
}

- slog.Debug("generate request", "prompt", prompt, "images", images)
+ slog.Debug("generate request", "images", len(images), "prompt", prompt)

ch := make(chan any)
go func() {
@@ -622,7 +622,7 @@ func (s *Server) PushHandler(c *gin.Context) {
}

func checkNameExists(name model.Name) error {
- names, err := Manifests()
+ names, err := Manifests(true)
if err != nil {
return err
}
@@ -894,7 +894,7 @@ func getKVData(digest string, verbose bool) (llm.KV, error) {
}

func (s *Server) ListHandler(c *gin.Context) {
- ms, err := Manifests()
+ ms, err := Manifests(true)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
@@ -1211,6 +1211,9 @@ func Serve(ln net.Listener) error {
}

if !envconfig.NoPrune() {
+ if _, err := Manifests(false); err != nil {
+ slog.Warn("corrupt manifests detected, skipping prune operation. Re-pull or delete to clear", "error", err)
+ } else {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
@@ -1225,6 +1228,7 @@ func Serve(ln net.Listener) error {
return err
}
}
+ }

ctx, done := context.WithCancel(context.Background())
schedCtx, schedDone := context.WithCancel(ctx)

@@ -130,11 +130,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
continue
}
numParallel := int(envconfig.NumParallel())
- // TODO (jmorganca): multimodal models don't support parallel yet
+ // TODO (jmorganca): mllama doesn't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
- if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+ if checkMllamaModelFamily(pending.model) && numParallel != 1 {
numParallel = 1
- slog.Warn("multimodal models don't support parallel requests yet")
+ slog.Warn("mllama doesn't support parallel requests yet")
}

for {