From b754f5a6a36d6f3068e938af525d8b056c7f9bce Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
Date: Wed, 30 Oct 2024 10:34:28 -0700
Subject: [PATCH] Remove submodule and shift to Go server - 0.4.0 (#7157)

* Remove llama.cpp submodule and shift new build to top

* CI: install msys2 with the clang, gcc-compat, and make packages on Windows

Needed for deepseek to work properly on Windows
---
 .dockerignore                               |    2 -
 .gitattributes                              |    1 -
 .github/workflows/release.yaml              |  172 +-
 .github/workflows/test.yaml                 |  170 +-
 .gitmodules                                 |    4 -
 Dockerfile                                  |  284 +-
 Makefile                                    |    4 +
 docs/development.md                         |  186 +-
 llama/Dockerfile                            |  221 -
 llama/README.md                             |   30 +-
 llama/llama.go                              |    2 +
 llama/make/Makefile.sync                    |    9 +-
 llama/vendoring                             |    1 +
 llm/ext_server/CMakeLists.txt               |   15 -
 llm/ext_server/httplib.h                    | 8794 -------------------
 llm/ext_server/server.cpp                   | 3227 -------
 llm/ext_server/utils.hpp                    |  661 --
 llm/generate/gen_common.sh                  |  137 -
 llm/generate/gen_darwin.sh                  |   91 -
 llm/generate/gen_linux.sh                   |  285 -
 llm/generate/gen_windows.ps1                |  403 -
 llm/generate/generate_darwin.go             |    3 -
 llm/generate/generate_linux.go              |    3 -
 llm/generate/generate_windows.go            |    3 -
 llm/llama.cpp                               |    1 -
 llm/patches/0000-cmakelist.patch            |   22 -
 llm/patches/0001-load-progress.patch        |   44 -
 llm/patches/0002-clip-log.patch             |   24 -
 llm/patches/0003-load_exception.patch       |   57 -
 llm/patches/0004-metal.patch                |   57 -
 llm/patches/0005-default-pretokenizer.patch |   44 -
 llm/patches/0006-embeddings.patch           |   54 -
 llm/patches/0007-clip-unicode.patch         |   54 -
 llm/patches/0008-solar-pro.patch            |  412 -
 scripts/build_darwin.sh                     |   12 +-
 scripts/build_docker.sh                     |    4 +-
 scripts/build_linux.sh                      |    2 +-
 scripts/build_remote.py                     |   76 -
 scripts/build_windows.ps1                   |   49 +-
 scripts/env.sh                              |    6 -
 40 files changed, 366 insertions(+), 15260 deletions(-)
 delete mode 100644 .gitmodules
 create mode 100644 Makefile
 delete mode 100644 llama/Dockerfile
 create mode 100644 llama/vendoring
 delete mode 100644 llm/ext_server/CMakeLists.txt
 delete mode 100644 llm/ext_server/httplib.h
 delete mode 100644 llm/ext_server/server.cpp
 delete mode 100644 llm/ext_server/utils.hpp
 delete mode 100644 llm/generate/gen_common.sh
 delete mode 100755 llm/generate/gen_darwin.sh
 delete mode 100755 llm/generate/gen_linux.sh
 delete mode 100644 llm/generate/gen_windows.ps1
 delete mode 100644 llm/generate/generate_darwin.go
 delete mode 100644 llm/generate/generate_linux.go
 delete mode 100644 llm/generate/generate_windows.go
 delete mode 160000 llm/llama.cpp
 delete mode 100644 llm/patches/0000-cmakelist.patch
 delete mode 100644 llm/patches/0001-load-progress.patch
 delete mode 100644 llm/patches/0002-clip-log.patch
 delete mode 100644 llm/patches/0003-load_exception.patch
 delete mode 100644 llm/patches/0004-metal.patch
 delete mode 100644 llm/patches/0005-default-pretokenizer.patch
 delete mode 100644 llm/patches/0006-embeddings.patch
 delete mode 100644 llm/patches/0007-clip-unicode.patch
 delete mode 100644 llm/patches/0008-solar-pro.patch
 delete mode 100755 scripts/build_remote.py

diff --git a/.dockerignore b/.dockerignore
index fada7a9b..76704c36 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,9 +3,7 @@ ollama
 app
 macapp
 dist
-llm/llama.cpp
 .env
 .cache
 test_data
-llm/build
 llama/build
diff --git a/.gitattributes b/.gitattributes
index 932ddcc9..51635caa 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +1,3 @@
-llm/ext_server/* linguist-vendored
 llama/**/*.cpp linguist-vendored
 llama/**/*.hpp linguist-vendored
 llama/**/*.h linguist-vendored
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index ac4c19b0..b7b94098 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -48,8 +48,8 @@ jobs:
         with:
           name: dist-darwin
           path: |
-            dist/*arwin*
-            !dist/*-cov
+            dist/Ollama-darwin.zip
+            dist/ollama-darwin
 
   # Windows builds take a long time to both install the dependencies and build, so parallelize
   # CPU generation step
@@ -85,6 +85,24 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -92,19 +110,19 @@ jobs:
       - run: go get ./...
       - run: |
           $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$env:PATH"
-          go generate -x ./...
-        name: go generate
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
+        name: make
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cpu
           path: |
             build/**/*
             build/**/*.a
-            llm/build/**/*.a
             dist/windows-amd64/**
 
   # ROCm generation step
@@ -140,6 +158,24 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -158,31 +194,21 @@ jobs:
       - run: go get ./...
       - run: |
           $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$env:PATH"
           $env:OLLAMA_SKIP_CPU_GENERATE="1"
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-      - name: 'gather rocm dependencies'
-        run: |
-          $HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          md "dist\deps\bin\rocblas\library"
-          cp "${HIP_PATH}\bin\hipblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas.dll" "dist\deps\bin\"
-          cp "${HIP_PATH}\bin\rocblas\library\*" "dist\deps\bin\rocblas\library\"
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
+        name: make
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-rocm
           path: |
             build/**/*
             dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-rocm-deps
-          path: dist/deps/*
 
   # CUDA generation step
   generate-windows-cuda:
@@ -224,6 +250,24 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -245,34 +289,23 @@ jobs:
       - name: 'Verify CUDA'
         run: nvcc -V
       - run: go get ./...
-      - name: go generate
+      - name: make
         run: |
           $gopath=(get-command go).source | split-path -parent
           $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$cudabin;$env:PATH"
           $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
-      - name: 'gather cuda dependencies'
-        run: |
-          $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0]
-          md "dist\deps"
-          cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\"
-          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cuda-${{ matrix.cuda.version }}
           path: |
             build/**/*
             dist/windows-amd64/**
-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-cuda-deps-${{ matrix.cuda.version }}
-          path: dist/deps/*
-
 
   # windows arm64 generate, go build, and zip file (no installer)
   # Output of this build is aggregated into the final x86 build
@@ -292,6 +325,30 @@ jobs:
           choco install -y --no-progress git gzip
           echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
           echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # pacman is buggy on Windows arm64, so we avoid running it and rely on prebuilt binary artifacts instead.
+      # We download the sfx (7zip bundle); it isn't fully set up, but the binaries we need for the build work.
+      - name: Install msys2 x64
+        run: |
+          $url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe"
+          write-host "Downloading MSYS2"
+          Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @(
+              '-y', '-oC:\'
+              ) -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      # Since pacman isn't reliable here, download the make package tar file and extract it directly.
+      - name: Downloading and extracting msys2 make tar file
+        run: |
+          $url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst"
+          write-host "Downloading make"
+          Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst
+          cd c:\msys64; tar -xf make.tar.zst
+          rm c:\msys64\make.tar.zst
+      - name: Verify Make works properly
+        run: |
+          echo $env:PATH
+          make --version
       - name: Install Visual Studio 2022
         run: |
           $components = @(
@@ -385,10 +442,9 @@ jobs:
       - run: |
           $gopath=(get-command go).source | split-path -parent
           $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin"
+          import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation
+          $env:PATH="$gopath;$gccpath;$env:PATH"
           echo $env:PATH
           $env:ARCH="arm64"
           .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip
@@ -441,6 +497,24 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -455,15 +529,6 @@ jobs:
       - uses: actions/download-artifact@v4
         with:
           name: generate-windows-cuda-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-11
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-rocm-deps
       - uses: actions/download-artifact@v4
         with:
           name: generate-windows-rocm
@@ -474,11 +539,12 @@ jobs:
       - run: dir build
       - run: |
           $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$env:PATH"
           $env:OLLAMA_SKIP_GENERATE="1"
+          $env:ARCH="amd64"
           & .\scripts\build_windows.ps1
       - uses: actions/upload-artifact@v4
         with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 10dbabe6..10065182 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -21,9 +21,6 @@ jobs:
   changes:
     runs-on: ubuntu-latest
     outputs:
-      GENERATE: ${{ steps.changes.outputs.GENERATE }}
-      GENERATE_CUDA: ${{ steps.changes.outputs.GENERATE_CUDA }}
-      GENERATE_ROCM: ${{ steps.changes.outputs.GENERATE_ROCM }}
       RUNNERS: ${{ steps.changes.outputs.RUNNERS }}
     steps:
       - uses: actions/checkout@v4
@@ -39,53 +36,12 @@ jobs:
           }
 
           {
-            echo GENERATE=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_CUDA=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
-            echo GENERATE_ROCM=$(changed 'llm/llama.cpp' 'llm/patches/**' 'llm/ext_server/**' 'llm/generate/**')
             echo RUNNERS=$(changed 'llama/**')
           } >>$GITHUB_OUTPUT
 
-  generate:
+  runners-linux-cuda:
     needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE == 'True' }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
-        arch: [amd64, arm64]
-        exclude:
-          - os: ubuntu-latest
-            arch: arm64
-          - os: windows-2019
-            arch: arm64
-    runs-on: ${{ matrix.os }}
-    env:
-      GOARCH: ${{ matrix.arch }}
-      CGO_ENABLED: '1'
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-          cache: true
-      - run: go get ./...
-      - run: |
-          $gopath=(get-command go).source | split-path -parent
-          $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$gccpath;$env:PATH"
-          echo $env:PATH
-          go generate -x ./...
-        if: ${{ startsWith(matrix.os, 'windows-') }}
-        name: 'Windows Go Generate'
-      - run: go generate -x ./...
-        if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        name: 'Unix Go Generate'
-      - run: go build .
-  generate-cuda:
-    needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
     strategy:
       matrix:
         cuda-version:
@@ -95,8 +51,6 @@ jobs:
     steps:
       - run: |
           apt-get update && apt-get install -y git build-essential curl
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
         env:
           DEBIAN_FRONTEND: noninteractive
       - uses: actions/checkout@v4
@@ -107,12 +61,11 @@ jobs:
       - run: go get ./...
       - run: |
           git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
-  generate-rocm:
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores cuda_v11
+  runners-linux-rocm:
     needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
     strategy:
       matrix:
         rocm-version:
@@ -122,8 +75,6 @@ jobs:
     steps:
       - run: |
           apt-get update && apt-get install -y git build-essential curl rocm-libs
-          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
-            | tar -zx -C /usr --strip-components 1
         env:
           DEBIAN_FRONTEND: noninteractive
       - uses: actions/checkout@v4
@@ -134,14 +85,13 @@ jobs:
       - run: go get ./...
       - run: |
           git config --global --add safe.directory /__w/ollama/ollama
-          go generate -x ./...
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l)
+          make -j $cores rocm
 
   # ROCm generation step
-  generate-windows-rocm:
+  runners-windows-rocm:
     needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
     runs-on: windows
     steps:
       - uses: actions/checkout@v4
@@ -160,24 +110,42 @@ jobs:
       - name: 'Verify ROCm'
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - run: go get ./...
       - run: |
           $gopath=(get-command go).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
-          $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:PATH="$gopath;$env:PATH"
           $env:OLLAMA_SKIP_CPU_GENERATE="1"
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          go generate -x ./...
-        name: go generate
-        env:
-          OLLAMA_SKIP_CPU_GENERATE: '1'
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          write-host $env:HIP_PATH
+          make -C llama print-HIP_PATH print-HIP_LIB_DIR
+          make -j $cores rocm
+        name: make
 
   # CUDA generation step
-  generate-windows-cuda:
+  runners-windows-cuda:
     needs: [changes]
-    if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
+    if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
     runs-on: windows
     steps:
       - uses: actions/checkout@v4
@@ -201,21 +169,40 @@ jobs:
           echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV
       - name: 'Verify CUDA'
         run: nvcc -V
+      - name: Install msys2
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - run: go get ./...
-      - name: go generate
+      - name: make
         run: |
           $gopath=(get-command go).source | split-path -parent
           $cudabin=(get-command nvcc).source | split-path
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$cudabin;$env:PATH"
           $env:OLLAMA_SKIP_CPU_GENERATE="1"
-          go generate -x ./...
+          $cores = (Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores
+          make -j $cores cuda_v11
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
 
-  runners:
+  runners-cpu:
     needs: [changes]
     if: ${{ needs.changes.outputs.RUNNERS == 'True' }}
     strategy:
@@ -239,20 +226,41 @@ jobs:
           go-version-file: go.mod
           cache: true
       - run: go get ./...
+      - name: Install msys2
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe"
+          write-host "Downloading msys2"
+          Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe"
+          write-host "Installing msys2"
+          Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait
+          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: Install msys2 tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait
+          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+      - name: verify tools
+        if: ${{ startsWith(matrix.os, 'windows-') }}
+        run: |
+          get-command gcc
+          gcc --version
+          get-command make
+          make --version
       - name: 'Build Windows Go Runners'
         if: ${{ startsWith(matrix.os, 'windows-') }}
         run: |
           $gopath=(get-command go).source | split-path -parent
           $gccpath=(get-command gcc).source | split-path -parent
-          & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
-          cd $env:GITHUB_WORKSPACE
+          import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
+          Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
           $env:PATH="$gopath;$gccpath;$env:PATH"
           echo $env:PATH
-          make -C llama -j 4      
+          make -j 4      
       - name: 'Build Unix Go Runners'
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
-        run: make -C llama -j 4
+        run: make -j 4
       - run: go build .
 
   lint:
@@ -302,9 +310,6 @@ jobs:
     env:
       GOARCH: ${{ matrix.arch }}
       CGO_ENABLED: '1'
-      OLLAMA_CPU_TARGET: 'static'
-      OLLAMA_SKIP_CPU_GENERATE: '1'
-      OLLAMA_SKIP_METAL_GENERATE: '1'
     steps:
       - uses: actions/checkout@v4
         with:
@@ -319,7 +324,6 @@ jobs:
             arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
         shell: bash
-      - run: go generate ./...
       - run: go build
       - run: go test -v ./...
 
@@ -333,4 +337,4 @@ jobs:
           submodules: recursive
       - name: Verify patches carry all the changes
         run: |
-          cd llama && make apply-patches sync && git diff --compact-summary --exit-code .
\ No newline at end of file
+          make apply-patches sync && git diff --compact-summary --exit-code llama
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index b92f645d..00000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "llama.cpp"]
-	path = llm/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
-	shallow = true
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index bb0f684b..16d1e4be 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,168 +6,134 @@ ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 
-# Copy the minimal context we need to run the generate scripts
-FROM scratch AS llm-code
-COPY .git .git
-COPY .gitmodules .gitmodules
-COPY llm llm
-
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH=amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
-    bash gen_linux.sh
-
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH=amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
-    CUDA_VARIANT="_v12" \
-    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
-    bash gen_linux.sh
-
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH=arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
-    bash gen_linux.sh
-
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH=arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
-    CUDA_VARIANT="_v12" \
-    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
-    bash gen_linux.sh
-
-
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV LIBRARY_PATH=/opt/amdgpu/lib64
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG AMDGPU_TARGETS
-ENV GOARCH=amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
-RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - )
-
-FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
+### To create a local image for building linux binaries on mac or windows with efficient incremental builds
+#
+# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 .
+# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
+#
+### Then incremental builds will be much faster in this container
+#
+# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+#
+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
+ARG CUDA_VERSION_11
+ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
+ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-ARG OLLAMA_CUSTOM_CPU_DEFS
-ARG CGO_CFLAGS
-ENV GOARCH=amd64
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
+    dnf clean all && \
+    dnf install -y \
+    zsh \
+    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
+    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
+# TODO intel oneapi goes here...
+ENV GOARCH amd64
+ENV CGO_ENABLED 1
+WORKDIR /go/src/github.com/ollama/ollama/
+ENTRYPOINT [ "zsh" ]
 
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
-FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
-
-FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
+### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
+# Note: this does not contain jetson variants
+#
+# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 .
+# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
+#
+FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
+ARG CUDA_VERSION_11
+ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-ARG OLLAMA_CUSTOM_CPU_DEFS
-ARG CGO_CFLAGS
-ENV GOARCH=arm64
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
+    dnf config-manager --set-enabled appstream && \
+    dnf clean all && \
+    dnf install -y \
+    zsh \
+    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
+    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
+ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
+ENV GOARCH=arm64
+ENV CGO_ENABLED=1
+WORKDIR /go/src/github.com/ollama/ollama/
+ENTRYPOINT [ "zsh" ]
 
-FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
+FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
+COPY . .
+ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_SKIP_CUDA_11_GENERATE
+ARG OLLAMA_SKIP_CUDA_12_GENERATE
+ARG OLLAMA_SKIP_ROCM_GENERATE
+ARG CUDA_V11_ARCHITECTURES
+ARG CUDA_V12_ARCHITECTURES
+ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
+        make -C llama -j $(expr $(nproc) / 2 ) ; \
+    else \
+        make -C llama -j 5 ; \
+    fi
+
+FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
+COPY . .
+ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_SKIP_CUDA_11_GENERATE
+ARG OLLAMA_SKIP_CUDA_12_GENERATE
+ARG CUDA_V11_ARCHITECTURES
+ARG CUDA_V12_ARCHITECTURES
+ARG OLLAMA_FAST_BUILD
+RUN --mount=type=cache,target=/root/.ccache \
+    make -C llama -j 8
 
 
 # Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
-ENV CGO_ENABLED=1
+FROM --platform=linux/amd64 centos:7 AS builder-amd64
+ARG CMAKE_VERSION
+ARG GOLANG_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ENV CGO_ENABLED 1
+ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama
+
+FROM --platform=linux/amd64 builder-amd64 AS build-amd64
 COPY . .
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
+ARG OLLAMA_SKIP_ROCM_GENERATE
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-amd64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
     tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz
+RUN if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" ] ; then \
+    cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
+    fi
 
-FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
-ENV CGO_ENABLED=1
+FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
+ARG CMAKE_VERSION
 ARG GOLANG_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+ENV CGO_ENABLED 1
+ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama
+
+FROM --platform=linux/arm64 builder-arm64 AS build-arm64
 COPY . .
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
@@ -179,11 +145,11 @@ FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
 FROM --platform=linux/arm64 scratch AS dist-arm64
 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH as dist
+FROM dist-$TARGETARCH AS dist
 
 
 # Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 cpu-builder-amd64 AS container-build-amd64
+FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
@@ -191,7 +157,7 @@ ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-amd64/bin/ollama .
 
-FROM --platform=linux/arm64 cpu-builder-arm64 AS container-build-arm64
+FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 ARG GOFLAGS
@@ -199,48 +165,52 @@ ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-arm64/bin/ollama .
 
+# For amd64 container images, filter out cuda/rocm to minimize size
+FROM runners-amd64 AS runners-cuda-amd64
+RUN rm -rf \
+    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
+    ./dist/linux-amd64/lib/ollama/runners/rocm*
+
+FROM runners-amd64 AS runners-rocm-amd64
+RUN rm -rf \
+    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
+    ./dist/linux-amd64/lib/ollama/libcu*.so* \
+    ./dist/linux-amd64/lib/ollama/runners/cuda*
+
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 
 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
 # Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
 # across releases
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/*
 COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0
 
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
 
 FROM runtime-$TARGETARCH
 EXPOSE 11434
-ENV OLLAMA_HOST=0.0.0.0
+ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..f59e072c
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+GOALS := $(or $(MAKECMDGOALS),all)
+.PHONY: $(GOALS)
+$(GOALS):
+	$(MAKE) -C llama $@
\ No newline at end of file
diff --git a/docs/development.md b/docs/development.md
index 001ca69e..48894920 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -1,183 +1,5 @@
 # Development
 
-> [!IMPORTANT]
-> The `llm` package that loads and runs models is being updated to use a new [Go runner](#transition-to-go-runner): this should only impact a small set of PRs however it does change how the project is built.
-
-Install required tools:
-
-- cmake version 3.24 or higher
-- go version 1.22 or higher
-- gcc version 11.4.0 or higher
-
-### MacOS
-
-```bash
-brew install go cmake gcc
-```
-
-Optionally enable debugging and more verbose logging:
-
-```bash
-# At build time
-export CGO_CFLAGS="-g"
-
-# At runtime
-export OLLAMA_DEBUG=1
-```
-
-Get the required libraries and build the native LLM code:
-
-```bash
-go generate ./...
-```
-
-Then build ollama:
-
-```bash
-go build .
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-#### Linux ROCm (AMD)
-
-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-```
-go generate ./...
-```
-
-Then build the binary:
-
-```
-go build .
-```
-
-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Advanced CPU Settings
-
-By default, running `go generate ./...` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. If you would like to build a CPU-based build customized for your
-processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
-like to use. For example, to compile an optimized binary for an Intel i9-9880H,
-you might use:
-
-```
-OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
-go build .
-```
-
-#### Containerized Linux Build
-
-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
-
-### Windows
-
-Note: The Windows build for Ollama is still under development.
-
-First, install required tools:
-
-- MSVC toolchain - C/C++ and cmake as minimal requirements
-- Go version 1.22 or higher
-- MinGW (pick one variant) with GCC.
-  - [MinGW-w64](https://www.mingw-w64.org/)
-  - [MSYS2](https://www.msys2.org/)
-- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
-
-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-go generate ./...
-go build .
-```
-
-#### Windows CUDA (NVIDIA)
-
-In addition to the common Windows development tools described above, install CUDA after installing MSVC.
-
-- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
-
-
-#### Windows ROCm (AMD Radeon)
-
-In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
-
-- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
-- [Strawberry Perl](https://strawberryperl.com/)
-
-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
-
-#### Windows arm64
-
-The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want.  To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
-
-```powershell
-import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
-Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
-```
-
-You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
-
-Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment.  Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
-
-```
-pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
-```
-
-You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
-
-
-## Transition to Go runner
-
-The Ollama team is working on moving to a new Go based runner that loads and runs models in a subprocess to replace the previous code under `ext_server`. During this transition period, this new Go runner is "opt in" at build time, and requires using a different approach to build.
-
-After the transition to use the Go server exclusively, both `make` and `go generate` will build the Go runner.
-
 Install required tools:
 
 - go version 1.22 or higher
@@ -201,7 +23,7 @@ export OLLAMA_DEBUG=1
 Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
 
 ```bash
-make -C llama -j 5
+make -j 5
 ```
 
 Then build ollama:
@@ -238,7 +60,7 @@ a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
 
 ```
-make -C llama -j 5
+make -j 5
 ```
 
 Then build the binary:
@@ -263,7 +85,7 @@ the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx
 Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
 
 ```
-make -C llama -j 5
+make -j 5
 ```
 
 Then build the binary:
@@ -308,7 +130,7 @@ Then, build the `ollama` binary:
 
 ```powershell
 $env:CGO_ENABLED="1"
-make -C llama -j 8
+make -j 8
 go build .
 ```
 
diff --git a/llama/Dockerfile b/llama/Dockerfile
deleted file mode 100644
index 4ab58f5d..00000000
--- a/llama/Dockerfile
+++ /dev/null
@@ -1,221 +0,0 @@
-# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
-ARG GOLANG_VERSION=1.22.8
-ARG CMAKE_VERSION=3.22.1
-ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-ARG ROCM_VERSION=6.1.2
-
-### To create a local image for building linux binaries on mac or windows with efficient incremental builds
-#
-# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
-# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
-#
-### Then incremental builds will be much faster in this container
-#
-# make -C llama -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
-#
-FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-# TODO intel oneapi goes here...
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
-# Note: this does not contain jetson variants
-#
-# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
-# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
-#
-FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-ARG CUDA_VERSION_11
-ARG CUDA_VERSION_12
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
-    dnf config-manager --set-enabled appstream && \
-    dnf clean all && \
-    dnf install -y \
-    zsh \
-    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
-ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
-ENV CGO_ENABLED 1
-WORKDIR /go/src/github.com/ollama/ollama/
-ENTRYPOINT [ "zsh" ]
-
-FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -C llama -j $(expr $(nproc) / 2 ) ; \
-    else \
-        make -C llama -j 5 ; \
-    fi
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    make -C llama -j 8
-
-
-# Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH amd64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/amd64 builder-amd64 AS build-amd64
-COPY . .
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-ARG OLLAMA_SKIP_ROCM_GENERATE
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
-
-FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH arm64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/arm64 builder-arm64 AS build-arm64
-COPY . .
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-
-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH AS dist
-
-
-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM runners-amd64 AS runners-cuda-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
-    ./dist/linux-amd64/lib/ollama/runners/rocm*
-
-FROM runners-amd64 AS runners-rocm-amd64
-RUN rm -rf \
-    ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
-    ./dist/linux-amd64/lib/ollama/libcu*.so* \
-    ./dist/linux-amd64/lib/ollama/runners/cuda*
-
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-
-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
-# across releases
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
-
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
-
-FROM runtime-$TARGETARCH
-EXPOSE 11434
-ENV OLLAMA_HOST 0.0.0.0
-ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-ENV NVIDIA_VISIBLE_DEVICES=all
-
-ENTRYPOINT ["/bin/ollama"]
-CMD ["serve"]
diff --git a/llama/README.md b/llama/README.md
index a2091036..ec54b989 100644
--- a/llama/README.md
+++ b/llama/README.md
@@ -95,31 +95,17 @@ make -j
 
 Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model.  While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit.  A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
 
-> [!IMPORTANT]
-> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit.  After merging that PR a new manifest file we be utilized
-
 If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
 
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 
 ### Updating Base Commit
 
 **Pin to new base commit**
 
-To update to a newer base commit, select the upstream git tag or commit
-
-> [!IMPORTANT]
-> After merging #7157 a manifest will be used instead of the submodule
-
-```
-cd llm/llama.cpp
-git fetch
-git checkout NEW_BASE_COMMIT
-cd ..
-git add llama.cpp
-```
+To update to a newer base commit, select the upstream git tag or commit and update `LLAMACPP_BASE_COMMIT` in `llama/vendoring`
 
 #### Applying patches
 
@@ -128,13 +114,13 @@ When updating to a newer base commit, the existing patches may not apply cleanly
 Start by applying the patches.  If any of the patches have conflicts, the `git am` will stop at the first failure.
 
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 
 If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed.  Save the file(s) and continue the patch series with `git am --continue` .  If any additional patches fail, follow the same pattern until the full patch series is applied.  Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
 
 ```
-make -C llama create-patches sync
+make create-patches sync
 ```
 
 Build and test Ollama, and make any necessary changes to the Go code based on the new base commit.  Submit your PR to the Ollama repo.
@@ -144,14 +130,14 @@ Build and test Ollama, and make any necessary changes to the Go code based on th
 When working on new fixes or features that impact vendored code, use the following model.  First get a clean tracking repo with all current patches applied:
 
 ```
-make -C llama apply-patches
+make apply-patches
 ```
 
 Now edit the upstream native code in the `./vendor/` directory.  You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing.  Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
 
 ```
-make -C llama sync
-make -C llama -j 8
+make sync
+make -j 8
 go build .
 ```
 
@@ -161,7 +147,7 @@ go build .
 Iterate until you're ready to submit PRs.  Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
 
 ```
-make -C llama create-patches
+make create-patches
 ```
 
 > [!IMPORTANT]
diff --git a/llama/llama.go b/llama/llama.go
index f7c0f362..54f4de9a 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -1,5 +1,7 @@
 package llama
 
+//go:generate make -j 8
+
 /*
 #cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
 #cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
diff --git a/llama/make/Makefile.sync b/llama/make/Makefile.sync
index 58f7ef18..a6a7124f 100644
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
@@ -1,11 +1,12 @@
 # Helpers for managing our vendored llama.cpp repo and patch set
 
-# TODO - this should include a manifest file at the top of the tree 
-LLAMACPP_BASE_COMMIT=$(shell cd ../llm/llama.cpp && git rev-parse HEAD)
+REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
+DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
 
-LLAMACPP_REPO := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))vendor/
+include $(REPO_ROOT)llama/vendoring
+
+LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
 
-DST_DIR=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
 LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
 
 
diff --git a/llama/vendoring b/llama/vendoring
new file mode 100644
index 00000000..87e0a9a3
--- /dev/null
+++ b/llama/vendoring
@@ -0,0 +1 @@
+LLAMACPP_BASE_COMMIT=3f1ae2e32cde00c39b96be6d01c2997c29bae555
\ No newline at end of file
diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt
deleted file mode 100644
index 51730245..00000000
--- a/llm/ext_server/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(TARGET ollama_llama_server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp httplib.h)
-install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
-if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
-    target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console)
-endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
\ No newline at end of file
diff --git a/llm/ext_server/httplib.h b/llm/ext_server/httplib.h
deleted file mode 100644
index 28746000..00000000
--- a/llm/ext_server/httplib.h
+++ /dev/null
@@ -1,8794 +0,0 @@
-//
-//  httplib.h
-//
-//  Copyright (c) 2023 Yuji Hirose. All rights reserved.
-//  MIT License
-//
-
-#ifndef CPPHTTPLIB_HTTPLIB_H
-#define CPPHTTPLIB_HTTPLIB_H
-
-#define CPPHTTPLIB_VERSION "0.12.2"
-
-/*
- * Configuration
- */
-
-#ifndef CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND
-#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
-#endif
-
-#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
-#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
-#endif
-
-#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
-#define CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND 300
-#endif
-
-#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND
-#define CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND 0
-#endif
-
-#ifndef CPPHTTPLIB_READ_TIMEOUT_SECOND
-#define CPPHTTPLIB_READ_TIMEOUT_SECOND 5
-#endif
-
-#ifndef CPPHTTPLIB_READ_TIMEOUT_USECOND
-#define CPPHTTPLIB_READ_TIMEOUT_USECOND 0
-#endif
-
-#ifndef CPPHTTPLIB_WRITE_TIMEOUT_SECOND
-#define CPPHTTPLIB_WRITE_TIMEOUT_SECOND 5
-#endif
-
-#ifndef CPPHTTPLIB_WRITE_TIMEOUT_USECOND
-#define CPPHTTPLIB_WRITE_TIMEOUT_USECOND 0
-#endif
-
-#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
-#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0
-#endif
-
-#ifndef CPPHTTPLIB_IDLE_INTERVAL_USECOND
-#ifdef _WIN32
-#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 10000
-#else
-#define CPPHTTPLIB_IDLE_INTERVAL_USECOND 0
-#endif
-#endif
-
-#ifndef CPPHTTPLIB_REQUEST_URI_MAX_LENGTH
-#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 8192
-#endif
-
-#ifndef CPPHTTPLIB_HEADER_MAX_LENGTH
-#define CPPHTTPLIB_HEADER_MAX_LENGTH 8192
-#endif
-
-#ifndef CPPHTTPLIB_REDIRECT_MAX_COUNT
-#define CPPHTTPLIB_REDIRECT_MAX_COUNT 20
-#endif
-
-#ifndef CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT
-#define CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT 1024
-#endif
-
-#ifndef CPPHTTPLIB_PAYLOAD_MAX_LENGTH
-#define CPPHTTPLIB_PAYLOAD_MAX_LENGTH ((std::numeric_limits<size_t>::max)())
-#endif
-
-#ifndef CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH
-#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
-#endif
-
-#ifndef CPPHTTPLIB_TCP_NODELAY
-#define CPPHTTPLIB_TCP_NODELAY false
-#endif
-
-#ifndef CPPHTTPLIB_RECV_BUFSIZ
-#define CPPHTTPLIB_RECV_BUFSIZ size_t(4096u)
-#endif
-
-#ifndef CPPHTTPLIB_COMPRESSION_BUFSIZ
-#define CPPHTTPLIB_COMPRESSION_BUFSIZ size_t(16384u)
-#endif
-
-#ifndef CPPHTTPLIB_THREAD_POOL_COUNT
-#define CPPHTTPLIB_THREAD_POOL_COUNT                                           \
-  ((std::max)(8u, std::thread::hardware_concurrency() > 0                      \
-                      ? std::thread::hardware_concurrency() - 1                \
-                      : 0))
-#endif
-
-#ifndef CPPHTTPLIB_RECV_FLAGS
-#define CPPHTTPLIB_RECV_FLAGS 0
-#endif
-
-#ifndef CPPHTTPLIB_SEND_FLAGS
-#define CPPHTTPLIB_SEND_FLAGS 0
-#endif
-
-#ifndef CPPHTTPLIB_LISTEN_BACKLOG
-#define CPPHTTPLIB_LISTEN_BACKLOG 5
-#endif
-
-/*
- * Headers
- */
-
-#ifdef _WIN32
-#ifndef _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_WARNINGS
-#endif //_CRT_SECURE_NO_WARNINGS
-
-#ifndef _CRT_NONSTDC_NO_DEPRECATE
-#define _CRT_NONSTDC_NO_DEPRECATE
-#endif //_CRT_NONSTDC_NO_DEPRECATE
-
-#if defined(_MSC_VER)
-#if _MSC_VER < 1900
-#error Sorry, Visual Studio versions prior to 2015 are not supported
-#endif
-
-#pragma comment(lib, "ws2_32.lib")
-
-#ifdef _WIN64
-using ssize_t = __int64;
-#else
-using ssize_t = long;
-#endif
-#endif // _MSC_VER
-
-#ifndef S_ISREG
-#define S_ISREG(m) (((m)&S_IFREG) == S_IFREG)
-#endif // S_ISREG
-
-#ifndef S_ISDIR
-#define S_ISDIR(m) (((m)&S_IFDIR) == S_IFDIR)
-#endif // S_ISDIR
-
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif // NOMINMAX
-
-#include <io.h>
-#include <winsock2.h>
-#include <ws2tcpip.h>
-
-#ifndef WSA_FLAG_NO_HANDLE_INHERIT
-#define WSA_FLAG_NO_HANDLE_INHERIT 0x80
-#endif
-
-#ifndef strcasecmp
-#define strcasecmp _stricmp
-#endif // strcasecmp
-
-using socket_t = SOCKET;
-#ifdef CPPHTTPLIB_USE_POLL
-#define poll(fds, nfds, timeout) WSAPoll(fds, nfds, timeout)
-#endif
-
-#else // not _WIN32
-
-#include <arpa/inet.h>
-#ifndef _AIX
-#include <ifaddrs.h>
-#endif
-#include <net/if.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#ifdef __linux__
-#include <resolv.h>
-#endif
-#include <netinet/tcp.h>
-#ifdef CPPHTTPLIB_USE_POLL
-#include <poll.h>
-#endif
-#include <csignal>
-#include <pthread.h>
-#include <sys/select.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <unistd.h>
-
-using socket_t = int;
-#ifndef INVALID_SOCKET
-#define INVALID_SOCKET (-1)
-#endif
-#endif //_WIN32
-
-#include <algorithm>
-#include <array>
-#include <atomic>
-#include <cassert>
-#include <cctype>
-#include <climits>
-#include <condition_variable>
-#include <cstring>
-#include <errno.h>
-#include <fcntl.h>
-#include <fstream>
-#include <functional>
-#include <iomanip>
-#include <iostream>
-#include <list>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <random>
-#include <regex>
-#include <set>
-#include <sstream>
-#include <string>
-#include <sys/stat.h>
-#include <thread>
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-#ifdef _WIN32
-#include <wincrypt.h>
-
-// these are defined in wincrypt.h and it breaks compilation if BoringSSL is
-// used
-#undef X509_NAME
-#undef X509_CERT_PAIR
-#undef X509_EXTENSIONS
-#undef PKCS7_SIGNER_INFO
-
-#ifdef _MSC_VER
-#pragma comment(lib, "crypt32.lib")
-#pragma comment(lib, "cryptui.lib")
-#endif
-#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__)
-#include <TargetConditionals.h>
-#if TARGET_OS_OSX
-#include <CoreFoundation/CoreFoundation.h>
-#include <Security/Security.h>
-#endif // TARGET_OS_OSX
-#endif // _WIN32
-
-#include <openssl/err.h>
-#include <openssl/evp.h>
-#include <openssl/ssl.h>
-#include <openssl/x509v3.h>
-
-#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK)
-#include <openssl/applink.c>
-#endif
-
-#include <iostream>
-#include <sstream>
-
-#if OPENSSL_VERSION_NUMBER < 0x1010100fL
-#error Sorry, OpenSSL versions prior to 1.1.1 are not supported
-#elif OPENSSL_VERSION_NUMBER < 0x30000000L
-#define SSL_get1_peer_certificate SSL_get_peer_certificate
-#endif
-
-#endif
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-#include <zlib.h>
-#endif
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-#include <brotli/decode.h>
-#include <brotli/encode.h>
-#endif
-
-/*
- * Declaration
- */
-namespace httplib {
-
-namespace detail {
-
-/*
- * Backport std::make_unique from C++14.
- *
- * NOTE: This code came up with the following stackoverflow post:
- * https://stackoverflow.com/questions/10149840/c-arrays-and-make-unique
- *
- */
-
-template <class T, class... Args>
-typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(Args &&...args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-template <class T>
-typename std::enable_if<std::is_array<T>::value, std::unique_ptr<T>>::type
-make_unique(std::size_t n) {
-  typedef typename std::remove_extent<T>::type RT;
-  return std::unique_ptr<T>(new RT[n]);
-}
-
-struct ci {
-  bool operator()(const std::string &s1, const std::string &s2) const {
-    return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(),
-                                        s2.end(),
-                                        [](unsigned char c1, unsigned char c2) {
-                                          return ::tolower(c1) < ::tolower(c2);
-                                        });
-  }
-};
-
-// This is based on
-// "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189".
-
-struct scope_exit {
-  explicit scope_exit(std::function<void(void)> &&f)
-      : exit_function(std::move(f)), execute_on_destruction{true} {}
-
-  scope_exit(scope_exit &&rhs)
-      : exit_function(std::move(rhs.exit_function)),
-        execute_on_destruction{rhs.execute_on_destruction} {
-    rhs.release();
-  }
-
-  ~scope_exit() {
-    if (execute_on_destruction) { this->exit_function(); }
-  }
-
-  void release() { this->execute_on_destruction = false; }
-
-private:
-  scope_exit(const scope_exit &) = delete;
-  void operator=(const scope_exit &) = delete;
-  scope_exit &operator=(scope_exit &&) = delete;
-
-  std::function<void(void)> exit_function;
-  bool execute_on_destruction;
-};
-
-} // namespace detail
-
-using Headers = std::multimap<std::string, std::string, detail::ci>;
-
-using Params = std::multimap<std::string, std::string>;
-using Match = std::smatch;
-
-using Progress = std::function<bool(uint64_t current, uint64_t total)>;
-
-struct Response;
-using ResponseHandler = std::function<bool(const Response &response)>;
-
-struct MultipartFormData {
-  std::string name;
-  std::string content;
-  std::string filename;
-  std::string content_type;
-};
-using MultipartFormDataItems = std::vector<MultipartFormData>;
-using MultipartFormDataMap = std::multimap<std::string, MultipartFormData>;
-
-class DataSink {
-public:
-  DataSink() : os(&sb_), sb_(*this) {}
-
-  DataSink(const DataSink &) = delete;
-  DataSink &operator=(const DataSink &) = delete;
-  DataSink(DataSink &&) = delete;
-  DataSink &operator=(DataSink &&) = delete;
-
-  std::function<bool(const char *data, size_t data_len)> write;
-  std::function<void()> done;
-  std::function<void(const Headers &trailer)> done_with_trailer;
-  std::ostream os;
-
-private:
-  class data_sink_streambuf : public std::streambuf {
-  public:
-    explicit data_sink_streambuf(DataSink &sink) : sink_(sink) {}
-
-  protected:
-    std::streamsize xsputn(const char *s, std::streamsize n) {
-      sink_.write(s, static_cast<size_t>(n));
-      return n;
-    }
-
-  private:
-    DataSink &sink_;
-  };
-
-  data_sink_streambuf sb_;
-};
-
-using ContentProvider =
-    std::function<bool(size_t offset, size_t length, DataSink &sink)>;
-
-using ContentProviderWithoutLength =
-    std::function<bool(size_t offset, DataSink &sink)>;
-
-using ContentProviderResourceReleaser = std::function<void(bool success)>;
-
-struct MultipartFormDataProvider {
-  std::string name;
-  ContentProviderWithoutLength provider;
-  std::string filename;
-  std::string content_type;
-};
-using MultipartFormDataProviderItems = std::vector<MultipartFormDataProvider>;
-
-using ContentReceiverWithProgress =
-    std::function<bool(const char *data, size_t data_length, uint64_t offset,
-                       uint64_t total_length)>;
-
-using ContentReceiver =
-    std::function<bool(const char *data, size_t data_length)>;
-
-using MultipartContentHeader =
-    std::function<bool(const MultipartFormData &file)>;
-
-class ContentReader {
-public:
-  using Reader = std::function<bool(ContentReceiver receiver)>;
-  using MultipartReader = std::function<bool(MultipartContentHeader header,
-                                             ContentReceiver receiver)>;
-
-  ContentReader(Reader reader, MultipartReader multipart_reader)
-      : reader_(std::move(reader)),
-        multipart_reader_(std::move(multipart_reader)) {}
-
-  bool operator()(MultipartContentHeader header,
-                  ContentReceiver receiver) const {
-    return multipart_reader_(std::move(header), std::move(receiver));
-  }
-
-  bool operator()(ContentReceiver receiver) const {
-    return reader_(std::move(receiver));
-  }
-
-  Reader reader_;
-  MultipartReader multipart_reader_;
-};
-
-using Range = std::pair<ssize_t, ssize_t>;
-using Ranges = std::vector<Range>;
-
-struct Request {
-  std::string method;
-  std::string path;
-  Headers headers;
-  std::string body;
-
-  std::string remote_addr;
-  int remote_port = -1;
-  std::string local_addr;
-  int local_port = -1;
-
-  // for server
-  std::string version;
-  std::string target;
-  Params params;
-  MultipartFormDataMap files;
-  Ranges ranges;
-  Match matches;
-
-  // for client
-  ResponseHandler response_handler;
-  ContentReceiverWithProgress content_receiver;
-  Progress progress;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  const SSL *ssl = nullptr;
-#endif
-
-  bool has_header(const std::string &key) const;
-  std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template <typename T>
-  T get_header_value(const std::string &key, size_t id = 0) const;
-  size_t get_header_value_count(const std::string &key) const;
-  void set_header(const std::string &key, const std::string &val);
-
-  bool has_param(const std::string &key) const;
-  std::string get_param_value(const std::string &key, size_t id = 0) const;
-  size_t get_param_value_count(const std::string &key) const;
-
-  bool is_multipart_form_data() const;
-
-  bool has_file(const std::string &key) const;
-  MultipartFormData get_file_value(const std::string &key) const;
-  std::vector<MultipartFormData> get_file_values(const std::string &key) const;
-
-  // private members...
-  size_t redirect_count_ = CPPHTTPLIB_REDIRECT_MAX_COUNT;
-  size_t content_length_ = 0;
-  ContentProvider content_provider_;
-  bool is_chunked_content_provider_ = false;
-  size_t authorization_count_ = 0;
-};
-
-struct Response {
-  std::string version;
-  int status = -1;
-  std::string reason;
-  Headers headers;
-  std::string body;
-  std::string location; // Redirect location
-
-  bool has_header(const std::string &key) const;
-  std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template <typename T>
-  T get_header_value(const std::string &key, size_t id = 0) const;
-  size_t get_header_value_count(const std::string &key) const;
-  void set_header(const std::string &key, const std::string &val);
-
-  void set_redirect(const std::string &url, int status = 302);
-  void set_content(const char *s, size_t n, const std::string &content_type);
-  void set_content(const std::string &s, const std::string &content_type);
-
-  void set_content_provider(
-      size_t length, const std::string &content_type, ContentProvider provider,
-      ContentProviderResourceReleaser resource_releaser = nullptr);
-
-  void set_content_provider(
-      const std::string &content_type, ContentProviderWithoutLength provider,
-      ContentProviderResourceReleaser resource_releaser = nullptr);
-
-  void set_chunked_content_provider(
-      const std::string &content_type, ContentProviderWithoutLength provider,
-      ContentProviderResourceReleaser resource_releaser = nullptr);
-
-  Response() = default;
-  Response(const Response &) = default;
-  Response &operator=(const Response &) = default;
-  Response(Response &&) = default;
-  Response &operator=(Response &&) = default;
-  ~Response() {
-    if (content_provider_resource_releaser_) {
-      content_provider_resource_releaser_(content_provider_success_);
-    }
-  }
-
-  // private members...
-  size_t content_length_ = 0;
-  ContentProvider content_provider_;
-  ContentProviderResourceReleaser content_provider_resource_releaser_;
-  bool is_chunked_content_provider_ = false;
-  bool content_provider_success_ = false;
-};
-
-class Stream {
-public:
-  virtual ~Stream() = default;
-
-  virtual bool is_readable() const = 0;
-  virtual bool is_writable() const = 0;
-
-  virtual ssize_t read(char *ptr, size_t size) = 0;
-  virtual ssize_t write(const char *ptr, size_t size) = 0;
-  virtual void get_remote_ip_and_port(std::string &ip, int &port) const = 0;
-  virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0;
-  virtual socket_t socket() const = 0;
-
-  template <typename... Args>
-  ssize_t write_format(const char *fmt, const Args &...args);
-  ssize_t write(const char *ptr);
-  ssize_t write(const std::string &s);
-};
-
-class TaskQueue {
-public:
-  TaskQueue() = default;
-  virtual ~TaskQueue() = default;
-
-  virtual void enqueue(std::function<void()> fn) = 0;
-  virtual void shutdown() = 0;
-
-  virtual void on_idle() {}
-};
-
-class ThreadPool : public TaskQueue {
-public:
-  explicit ThreadPool(size_t n) : shutdown_(false) {
-    while (n) {
-      threads_.emplace_back(worker(*this));
-      n--;
-    }
-  }
-
-  ThreadPool(const ThreadPool &) = delete;
-  ~ThreadPool() override = default;
-
-  void enqueue(std::function<void()> fn) override {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      jobs_.push_back(std::move(fn));
-    }
-
-    cond_.notify_one();
-  }
-
-  void shutdown() override {
-    // Stop all worker threads...
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      shutdown_ = true;
-    }
-
-    cond_.notify_all();
-
-    // Join...
-    for (auto &t : threads_) {
-      t.join();
-    }
-  }
-
-private:
-  struct worker {
-    explicit worker(ThreadPool &pool) : pool_(pool) {}
-
-    void operator()() {
-      for (;;) {
-        std::function<void()> fn;
-        {
-          std::unique_lock<std::mutex> lock(pool_.mutex_);
-
-          pool_.cond_.wait(
-              lock, [&] { return !pool_.jobs_.empty() || pool_.shutdown_; });
-
-          if (pool_.shutdown_ && pool_.jobs_.empty()) { break; }
-
-          fn = std::move(pool_.jobs_.front());
-          pool_.jobs_.pop_front();
-        }
-
-        assert(true == static_cast<bool>(fn));
-        fn();
-      }
-    }
-
-    ThreadPool &pool_;
-  };
-  friend struct worker;
-
-  std::vector<std::thread> threads_;
-  std::list<std::function<void()>> jobs_;
-
-  bool shutdown_;
-
-  std::condition_variable cond_;
-  std::mutex mutex_;
-};
-
-using Logger = std::function<void(const Request &, const Response &)>;
-
-using SocketOptions = std::function<void(socket_t sock)>;
-
-void default_socket_options(socket_t sock);
-
-class Server {
-public:
-  using Handler = std::function<void(const Request &, Response &)>;
-
-  using ExceptionHandler =
-      std::function<void(const Request &, Response &, std::exception_ptr ep)>;
-
-  enum class HandlerResponse {
-    Handled,
-    Unhandled,
-  };
-  using HandlerWithResponse =
-      std::function<HandlerResponse(const Request &, Response &)>;
-
-  using HandlerWithContentReader = std::function<void(
-      const Request &, Response &, const ContentReader &content_reader)>;
-
-  using Expect100ContinueHandler =
-      std::function<int(const Request &, Response &)>;
-
-  Server();
-
-  virtual ~Server();
-
-  virtual bool is_valid() const;
-
-  Server &Get(const std::string &pattern, Handler handler);
-  Server &Post(const std::string &pattern, Handler handler);
-  Server &Post(const std::string &pattern, HandlerWithContentReader handler);
-  Server &Put(const std::string &pattern, Handler handler);
-  Server &Put(const std::string &pattern, HandlerWithContentReader handler);
-  Server &Patch(const std::string &pattern, Handler handler);
-  Server &Patch(const std::string &pattern, HandlerWithContentReader handler);
-  Server &Delete(const std::string &pattern, Handler handler);
-  Server &Delete(const std::string &pattern, HandlerWithContentReader handler);
-  Server &Options(const std::string &pattern, Handler handler);
-
-  bool set_base_dir(const std::string &dir,
-                    const std::string &mount_point = std::string());
-  bool set_mount_point(const std::string &mount_point, const std::string &dir,
-                       Headers headers = Headers());
-  bool remove_mount_point(const std::string &mount_point);
-  Server &set_file_extension_and_mimetype_mapping(const std::string &ext,
-                                                  const std::string &mime);
-  Server &set_file_request_handler(Handler handler);
-
-  Server &set_error_handler(HandlerWithResponse handler);
-  Server &set_error_handler(Handler handler);
-  Server &set_exception_handler(ExceptionHandler handler);
-  Server &set_pre_routing_handler(HandlerWithResponse handler);
-  Server &set_post_routing_handler(Handler handler);
-
-  Server &set_expect_100_continue_handler(Expect100ContinueHandler handler);
-  Server &set_logger(Logger logger);
-
-  Server &set_address_family(int family);
-  Server &set_tcp_nodelay(bool on);
-  Server &set_socket_options(SocketOptions socket_options);
-
-  Server &set_default_headers(Headers headers);
-
-  Server &set_keep_alive_max_count(size_t count);
-  Server &set_keep_alive_timeout(time_t sec);
-
-  Server &set_read_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  Server &set_read_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  Server &set_write_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  Server &set_write_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  Server &set_idle_interval(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  Server &set_idle_interval(const std::chrono::duration<Rep, Period> &duration);
-
-  Server &set_payload_max_length(size_t length);
-
-  bool bind_to_port(const std::string &host, int port, int socket_flags = 0);
-  int bind_to_any_port(const std::string &host, int socket_flags = 0);
-  bool listen_after_bind();
-
-  bool listen(const std::string &host, int port, int socket_flags = 0);
-
-  bool is_running() const;
-  void wait_until_ready() const;
-  void stop();
-
-  std::function<TaskQueue *(void)> new_task_queue;
-
-protected:
-  bool process_request(Stream &strm, bool close_connection,
-                       bool &connection_closed,
-                       const std::function<void(Request &)> &setup_request);
-
-  std::atomic<socket_t> svr_sock_{INVALID_SOCKET};
-  size_t keep_alive_max_count_ = CPPHTTPLIB_KEEPALIVE_MAX_COUNT;
-  time_t keep_alive_timeout_sec_ = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND;
-  time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND;
-  time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND;
-  time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND;
-  time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND;
-  time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND;
-  time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND;
-  size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
-
-private:
-  using Handlers = std::vector<std::pair<std::regex, Handler>>;
-  using HandlersForContentReader =
-      std::vector<std::pair<std::regex, HandlerWithContentReader>>;
-
-  socket_t create_server_socket(const std::string &host, int port,
-                                int socket_flags,
-                                SocketOptions socket_options) const;
-  int bind_internal(const std::string &host, int port, int socket_flags);
-  bool listen_internal();
-
-  bool routing(Request &req, Response &res, Stream &strm);
-  bool handle_file_request(const Request &req, Response &res,
-                           bool head = false);
-  bool dispatch_request(Request &req, Response &res, const Handlers &handlers);
-  bool
-  dispatch_request_for_content_reader(Request &req, Response &res,
-                                      ContentReader content_reader,
-                                      const HandlersForContentReader &handlers);
-
-  bool parse_request_line(const char *s, Request &req);
-  void apply_ranges(const Request &req, Response &res,
-                    std::string &content_type, std::string &boundary);
-  bool write_response(Stream &strm, bool close_connection, const Request &req,
-                      Response &res);
-  bool write_response_with_content(Stream &strm, bool close_connection,
-                                   const Request &req, Response &res);
-  bool write_response_core(Stream &strm, bool close_connection,
-                           const Request &req, Response &res,
-                           bool need_apply_ranges);
-  bool write_content_with_provider(Stream &strm, const Request &req,
-                                   Response &res, const std::string &boundary,
-                                   const std::string &content_type);
-  bool read_content(Stream &strm, Request &req, Response &res);
-  bool
-  read_content_with_content_receiver(Stream &strm, Request &req, Response &res,
-                                     ContentReceiver receiver,
-                                     MultipartContentHeader multipart_header,
-                                     ContentReceiver multipart_receiver);
-  bool read_content_core(Stream &strm, Request &req, Response &res,
-                         ContentReceiver receiver,
-                         MultipartContentHeader multipart_header,
-                         ContentReceiver multipart_receiver);
-
-  virtual bool process_and_close_socket(socket_t sock);
-
-  struct MountPointEntry {
-    std::string mount_point;
-    std::string base_dir;
-    Headers headers;
-  };
-  std::vector<MountPointEntry> base_dirs_;
-
-  std::atomic<bool> is_running_{false};
-  std::atomic<bool> done_{false};
-  std::map<std::string, std::string> file_extension_and_mimetype_map_;
-  Handler file_request_handler_;
-  Handlers get_handlers_;
-  Handlers post_handlers_;
-  HandlersForContentReader post_handlers_for_content_reader_;
-  Handlers put_handlers_;
-  HandlersForContentReader put_handlers_for_content_reader_;
-  Handlers patch_handlers_;
-  HandlersForContentReader patch_handlers_for_content_reader_;
-  Handlers delete_handlers_;
-  HandlersForContentReader delete_handlers_for_content_reader_;
-  Handlers options_handlers_;
-  HandlerWithResponse error_handler_;
-  ExceptionHandler exception_handler_;
-  HandlerWithResponse pre_routing_handler_;
-  Handler post_routing_handler_;
-  Logger logger_;
-  Expect100ContinueHandler expect_100_continue_handler_;
-
-  int address_family_ = AF_UNSPEC;
-  bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY;
-  SocketOptions socket_options_ = default_socket_options;
-
-  Headers default_headers_;
-};
-
-enum class Error {
-  Success = 0,
-  Unknown,
-  Connection,
-  BindIPAddress,
-  Read,
-  Write,
-  ExceedRedirectCount,
-  Canceled,
-  SSLConnection,
-  SSLLoadingCerts,
-  SSLServerVerification,
-  UnsupportedMultipartBoundaryChars,
-  Compression,
-  ConnectionTimeout,
-
-  // For internal use only
-  SSLPeerCouldBeClosed_,
-};
-
-std::string to_string(const Error error);
-
-std::ostream &operator<<(std::ostream &os, const Error &obj);
-
-class Result {
-public:
-  Result(std::unique_ptr<Response> &&res, Error err,
-         Headers &&request_headers = Headers{})
-      : res_(std::move(res)), err_(err),
-        request_headers_(std::move(request_headers)) {}
-  // Response
-  operator bool() const { return res_ != nullptr; }
-  bool operator==(std::nullptr_t) const { return res_ == nullptr; }
-  bool operator!=(std::nullptr_t) const { return res_ != nullptr; }
-  const Response &value() const { return *res_; }
-  Response &value() { return *res_; }
-  const Response &operator*() const { return *res_; }
-  Response &operator*() { return *res_; }
-  const Response *operator->() const { return res_.get(); }
-  Response *operator->() { return res_.get(); }
-
-  // Error
-  Error error() const { return err_; }
-
-  // Request Headers
-  bool has_request_header(const std::string &key) const;
-  std::string get_request_header_value(const std::string &key,
-                                       size_t id = 0) const;
-  template <typename T>
-  T get_request_header_value(const std::string &key, size_t id = 0) const;
-  size_t get_request_header_value_count(const std::string &key) const;
-
-private:
-  std::unique_ptr<Response> res_;
-  Error err_;
-  Headers request_headers_;
-};
-
-class ClientImpl {
-public:
-  explicit ClientImpl(const std::string &host);
-
-  explicit ClientImpl(const std::string &host, int port);
-
-  explicit ClientImpl(const std::string &host, int port,
-                      const std::string &client_cert_path,
-                      const std::string &client_key_path);
-
-  virtual ~ClientImpl();
-
-  virtual bool is_valid() const;
-
-  Result Get(const std::string &path);
-  Result Get(const std::string &path, const Headers &headers);
-  Result Get(const std::string &path, Progress progress);
-  Result Get(const std::string &path, const Headers &headers,
-             Progress progress);
-  Result Get(const std::string &path, ContentReceiver content_receiver);
-  Result Get(const std::string &path, const Headers &headers,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, ContentReceiver content_receiver,
-             Progress progress);
-  Result Get(const std::string &path, const Headers &headers,
-             ContentReceiver content_receiver, Progress progress);
-  Result Get(const std::string &path, ResponseHandler response_handler,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, const Headers &headers,
-             ResponseHandler response_handler,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, ResponseHandler response_handler,
-             ContentReceiver content_receiver, Progress progress);
-  Result Get(const std::string &path, const Headers &headers,
-             ResponseHandler response_handler, ContentReceiver content_receiver,
-             Progress progress);
-
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, Progress progress = nullptr);
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, ContentReceiver content_receiver,
-             Progress progress = nullptr);
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, ResponseHandler response_handler,
-             ContentReceiver content_receiver, Progress progress = nullptr);
-
-  Result Head(const std::string &path);
-  Result Head(const std::string &path, const Headers &headers);
-
-  Result Post(const std::string &path);
-  Result Post(const std::string &path, const Headers &headers);
-  Result Post(const std::string &path, const char *body, size_t content_length,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers, const char *body,
-              size_t content_length, const std::string &content_type);
-  Result Post(const std::string &path, const std::string &body,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              const std::string &body, const std::string &content_type);
-  Result Post(const std::string &path, size_t content_length,
-              ContentProvider content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path,
-              ContentProviderWithoutLength content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              size_t content_length, ContentProvider content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              ContentProviderWithoutLength content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Params &params);
-  Result Post(const std::string &path, const Headers &headers,
-              const Params &params);
-  Result Post(const std::string &path, const MultipartFormDataItems &items);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items, const std::string &boundary);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items,
-              const MultipartFormDataProviderItems &provider_items);
-
-  Result Put(const std::string &path);
-  Result Put(const std::string &path, const char *body, size_t content_length,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, const char *body,
-             size_t content_length, const std::string &content_type);
-  Result Put(const std::string &path, const std::string &body,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             const std::string &body, const std::string &content_type);
-  Result Put(const std::string &path, size_t content_length,
-             ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path,
-             ContentProviderWithoutLength content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             size_t content_length, ContentProvider content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             ContentProviderWithoutLength content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Params &params);
-  Result Put(const std::string &path, const Headers &headers,
-             const Params &params);
-  Result Put(const std::string &path, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items, const std::string &boundary);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items,
-             const MultipartFormDataProviderItems &provider_items);
-
-  Result Patch(const std::string &path);
-  Result Patch(const std::string &path, const char *body, size_t content_length,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               const char *body, size_t content_length,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const std::string &body,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, size_t content_length,
-               ContentProvider content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path,
-               ContentProviderWithoutLength content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               size_t content_length, ContentProvider content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               ContentProviderWithoutLength content_provider,
-               const std::string &content_type);
-
-  Result Delete(const std::string &path);
-  Result Delete(const std::string &path, const Headers &headers);
-  Result Delete(const std::string &path, const char *body,
-                size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers,
-                const char *body, size_t content_length,
-                const std::string &content_type);
-  Result Delete(const std::string &path, const std::string &body,
-                const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers,
-                const std::string &body, const std::string &content_type);
-
-  Result Options(const std::string &path);
-  Result Options(const std::string &path, const Headers &headers);
-
-  bool send(Request &req, Response &res, Error &error);
-  Result send(const Request &req);
-
-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
-  void stop();
-
-  void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
-
-  void set_default_headers(Headers headers);
-
-  void set_address_family(int family);
-  void set_tcp_nodelay(bool on);
-  void set_socket_options(SocketOptions socket_options);
-
-  void set_connection_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void
-  set_connection_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_read_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_read_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_write_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_write_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_basic_auth(const std::string &username, const std::string &password);
-  void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username,
-                       const std::string &password);
-#endif
-
-  void set_keep_alive(bool on);
-  void set_follow_location(bool on);
-
-  void set_url_encode(bool on);
-
-  void set_compress(bool on);
-
-  void set_decompress(bool on);
-
-  void set_interface(const std::string &intf);
-
-  void set_proxy(const std::string &host, int port);
-  void set_proxy_basic_auth(const std::string &username,
-                            const std::string &password);
-  void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username,
-                             const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path,
-                        const std::string &ca_cert_dir_path = std::string());
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-#endif
-
-  void set_logger(Logger logger);
-
-protected:
-  struct Socket {
-    socket_t sock = INVALID_SOCKET;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    SSL *ssl = nullptr;
-#endif
-
-    bool is_open() const { return sock != INVALID_SOCKET; }
-  };
-
-  virtual bool create_and_connect_socket(Socket &socket, Error &error);
-
-  // All of:
-  //   shutdown_ssl
-  //   shutdown_socket
-  //   close_socket
-  // should ONLY be called when socket_mutex_ is locked.
-  // Also, shutdown_ssl and close_socket should also NOT be called concurrently
-  // with a DIFFERENT thread sending requests using that socket.
-  virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully);
-  void shutdown_socket(Socket &socket);
-  void close_socket(Socket &socket);
-
-  bool process_request(Stream &strm, Request &req, Response &res,
-                       bool close_connection, Error &error);
-
-  bool write_content_with_provider(Stream &strm, const Request &req,
-                                   Error &error);
-
-  void copy_settings(const ClientImpl &rhs);
-
-  // Socket endpoint information
-  const std::string host_;
-  const int port_;
-  const std::string host_and_port_;
-
-  // Current open socket
-  Socket socket_;
-  mutable std::mutex socket_mutex_;
-  std::recursive_mutex request_mutex_;
-
-  // These are all protected under socket_mutex
-  size_t socket_requests_in_flight_ = 0;
-  std::thread::id socket_requests_are_from_thread_ = std::thread::id();
-  bool socket_should_be_closed_when_request_is_done_ = false;
-
-  // Hostname-IP map
-  std::map<std::string, std::string> addr_map_;
-
-  // Default headers
-  Headers default_headers_;
-
-  // Settings
-  std::string client_cert_path_;
-  std::string client_key_path_;
-
-  time_t connection_timeout_sec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND;
-  time_t connection_timeout_usec_ = CPPHTTPLIB_CONNECTION_TIMEOUT_USECOND;
-  time_t read_timeout_sec_ = CPPHTTPLIB_READ_TIMEOUT_SECOND;
-  time_t read_timeout_usec_ = CPPHTTPLIB_READ_TIMEOUT_USECOND;
-  time_t write_timeout_sec_ = CPPHTTPLIB_WRITE_TIMEOUT_SECOND;
-  time_t write_timeout_usec_ = CPPHTTPLIB_WRITE_TIMEOUT_USECOND;
-
-  std::string basic_auth_username_;
-  std::string basic_auth_password_;
-  std::string bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string digest_auth_username_;
-  std::string digest_auth_password_;
-#endif
-
-  bool keep_alive_ = false;
-  bool follow_location_ = false;
-
-  bool url_encode_ = true;
-
-  int address_family_ = AF_UNSPEC;
-  bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY;
-  SocketOptions socket_options_ = nullptr;
-
-  bool compress_ = false;
-  bool decompress_ = true;
-
-  std::string interface_;
-
-  std::string proxy_host_;
-  int proxy_port_ = -1;
-
-  std::string proxy_basic_auth_username_;
-  std::string proxy_basic_auth_password_;
-  std::string proxy_bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string proxy_digest_auth_username_;
-  std::string proxy_digest_auth_password_;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  std::string ca_cert_file_path_;
-  std::string ca_cert_dir_path_;
-
-  X509_STORE *ca_cert_store_ = nullptr;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  bool server_certificate_verification_ = true;
-#endif
-
-  Logger logger_;
-
-private:
-  bool send_(Request &req, Response &res, Error &error);
-  Result send_(Request &&req);
-
-  socket_t create_client_socket(Error &error) const;
-  bool read_response_line(Stream &strm, const Request &req, Response &res);
-  bool write_request(Stream &strm, Request &req, bool close_connection,
-                     Error &error);
-  bool redirect(Request &req, Response &res, Error &error);
-  bool handle_request(Stream &strm, Request &req, Response &res,
-                      bool close_connection, Error &error);
-  std::unique_ptr<Response> send_with_content_provider(
-      Request &req, const char *body, size_t content_length,
-      ContentProvider content_provider,
-      ContentProviderWithoutLength content_provider_without_length,
-      const std::string &content_type, Error &error);
-  Result send_with_content_provider(
-      const std::string &method, const std::string &path,
-      const Headers &headers, const char *body, size_t content_length,
-      ContentProvider content_provider,
-      ContentProviderWithoutLength content_provider_without_length,
-      const std::string &content_type);
-  ContentProviderWithoutLength get_multipart_content_provider(
-      const std::string &boundary, const MultipartFormDataItems &items,
-      const MultipartFormDataProviderItems &provider_items);
-
-  std::string adjust_host_string(const std::string &host) const;
-
-  virtual bool process_socket(const Socket &socket,
-                              std::function<bool(Stream &strm)> callback);
-  virtual bool is_ssl() const;
-};
-
-class Client {
-public:
-  // Universal interface
-  explicit Client(const std::string &scheme_host_port);
-
-  explicit Client(const std::string &scheme_host_port,
-                  const std::string &client_cert_path,
-                  const std::string &client_key_path);
-
-  // HTTP only interface
-  explicit Client(const std::string &host, int port);
-
-  explicit Client(const std::string &host, int port,
-                  const std::string &client_cert_path,
-                  const std::string &client_key_path);
-
-  Client(Client &&) = default;
-
-  ~Client();
-
-  bool is_valid() const;
-
-  Result Get(const std::string &path);
-  Result Get(const std::string &path, const Headers &headers);
-  Result Get(const std::string &path, Progress progress);
-  Result Get(const std::string &path, const Headers &headers,
-             Progress progress);
-  Result Get(const std::string &path, ContentReceiver content_receiver);
-  Result Get(const std::string &path, const Headers &headers,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, ContentReceiver content_receiver,
-             Progress progress);
-  Result Get(const std::string &path, const Headers &headers,
-             ContentReceiver content_receiver, Progress progress);
-  Result Get(const std::string &path, ResponseHandler response_handler,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, const Headers &headers,
-             ResponseHandler response_handler,
-             ContentReceiver content_receiver);
-  Result Get(const std::string &path, const Headers &headers,
-             ResponseHandler response_handler, ContentReceiver content_receiver,
-             Progress progress);
-  Result Get(const std::string &path, ResponseHandler response_handler,
-             ContentReceiver content_receiver, Progress progress);
-
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, Progress progress = nullptr);
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, ContentReceiver content_receiver,
-             Progress progress = nullptr);
-  Result Get(const std::string &path, const Params &params,
-             const Headers &headers, ResponseHandler response_handler,
-             ContentReceiver content_receiver, Progress progress = nullptr);
-
-  Result Head(const std::string &path);
-  Result Head(const std::string &path, const Headers &headers);
-
-  Result Post(const std::string &path);
-  Result Post(const std::string &path, const Headers &headers);
-  Result Post(const std::string &path, const char *body, size_t content_length,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers, const char *body,
-              size_t content_length, const std::string &content_type);
-  Result Post(const std::string &path, const std::string &body,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              const std::string &body, const std::string &content_type);
-  Result Post(const std::string &path, size_t content_length,
-              ContentProvider content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path,
-              ContentProviderWithoutLength content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              size_t content_length, ContentProvider content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Headers &headers,
-              ContentProviderWithoutLength content_provider,
-              const std::string &content_type);
-  Result Post(const std::string &path, const Params &params);
-  Result Post(const std::string &path, const Headers &headers,
-              const Params &params);
-  Result Post(const std::string &path, const MultipartFormDataItems &items);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items, const std::string &boundary);
-  Result Post(const std::string &path, const Headers &headers,
-              const MultipartFormDataItems &items,
-              const MultipartFormDataProviderItems &provider_items);
-
-  Result Put(const std::string &path);
-  Result Put(const std::string &path, const char *body, size_t content_length,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers, const char *body,
-             size_t content_length, const std::string &content_type);
-  Result Put(const std::string &path, const std::string &body,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             const std::string &body, const std::string &content_type);
-  Result Put(const std::string &path, size_t content_length,
-             ContentProvider content_provider, const std::string &content_type);
-  Result Put(const std::string &path,
-             ContentProviderWithoutLength content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             size_t content_length, ContentProvider content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Headers &headers,
-             ContentProviderWithoutLength content_provider,
-             const std::string &content_type);
-  Result Put(const std::string &path, const Params &params);
-  Result Put(const std::string &path, const Headers &headers,
-             const Params &params);
-  Result Put(const std::string &path, const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items, const std::string &boundary);
-  Result Put(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items,
-             const MultipartFormDataProviderItems &provider_items);
-
-  Result Patch(const std::string &path);
-  Result Patch(const std::string &path, const char *body, size_t content_length,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               const char *body, size_t content_length,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const std::string &body,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               const std::string &body, const std::string &content_type);
-  Result Patch(const std::string &path, size_t content_length,
-               ContentProvider content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path,
-               ContentProviderWithoutLength content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               size_t content_length, ContentProvider content_provider,
-               const std::string &content_type);
-  Result Patch(const std::string &path, const Headers &headers,
-               ContentProviderWithoutLength content_provider,
-               const std::string &content_type);
-
-  Result Delete(const std::string &path);
-  Result Delete(const std::string &path, const Headers &headers);
-  Result Delete(const std::string &path, const char *body,
-                size_t content_length, const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers,
-                const char *body, size_t content_length,
-                const std::string &content_type);
-  Result Delete(const std::string &path, const std::string &body,
-                const std::string &content_type);
-  Result Delete(const std::string &path, const Headers &headers,
-                const std::string &body, const std::string &content_type);
-
-  Result Options(const std::string &path);
-  Result Options(const std::string &path, const Headers &headers);
-
-  bool send(Request &req, Response &res, Error &error);
-  Result send(const Request &req);
-
-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
-  void stop();
-
-  void set_hostname_addr_map(std::map<std::string, std::string> addr_map);
-
-  void set_default_headers(Headers headers);
-
-  void set_address_family(int family);
-  void set_tcp_nodelay(bool on);
-  void set_socket_options(SocketOptions socket_options);
-
-  void set_connection_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void
-  set_connection_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_read_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_read_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_write_timeout(time_t sec, time_t usec = 0);
-  template <class Rep, class Period>
-  void set_write_timeout(const std::chrono::duration<Rep, Period> &duration);
-
-  void set_basic_auth(const std::string &username, const std::string &password);
-  void set_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_digest_auth(const std::string &username,
-                       const std::string &password);
-#endif
-
-  void set_keep_alive(bool on);
-  void set_follow_location(bool on);
-
-  void set_url_encode(bool on);
-
-  void set_compress(bool on);
-
-  void set_decompress(bool on);
-
-  void set_interface(const std::string &intf);
-
-  void set_proxy(const std::string &host, int port);
-  void set_proxy_basic_auth(const std::string &username,
-                            const std::string &password);
-  void set_proxy_bearer_token_auth(const std::string &token);
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_proxy_digest_auth(const std::string &username,
-                             const std::string &password);
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void enable_server_certificate_verification(bool enabled);
-#endif
-
-  void set_logger(Logger logger);
-
-  // SSL
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  void set_ca_cert_path(const std::string &ca_cert_file_path,
-                        const std::string &ca_cert_dir_path = std::string());
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-#endif
-
-private:
-  std::unique_ptr<ClientImpl> cli_;
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  bool is_ssl_ = false;
-#endif
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLServer : public Server {
-public:
-  SSLServer(const char *cert_path, const char *private_key_path,
-            const char *client_ca_cert_file_path = nullptr,
-            const char *client_ca_cert_dir_path = nullptr,
-            const char *private_key_password = nullptr);
-
-  SSLServer(X509 *cert, EVP_PKEY *private_key,
-            X509_STORE *client_ca_cert_store = nullptr);
-
-  SSLServer(
-      const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback);
-
-  ~SSLServer() override;
-
-  bool is_valid() const override;
-
-  SSL_CTX *ssl_context() const;
-
-private:
-  bool process_and_close_socket(socket_t sock) override;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-};
-
-class SSLClient : public ClientImpl {
-public:
-  explicit SSLClient(const std::string &host);
-
-  explicit SSLClient(const std::string &host, int port);
-
-  explicit SSLClient(const std::string &host, int port,
-                     const std::string &client_cert_path,
-                     const std::string &client_key_path);
-
-  explicit SSLClient(const std::string &host, int port, X509 *client_cert,
-                     EVP_PKEY *client_key);
-
-  ~SSLClient() override;
-
-  bool is_valid() const override;
-
-  void set_ca_cert_store(X509_STORE *ca_cert_store);
-
-  long get_openssl_verify_result() const;
-
-  SSL_CTX *ssl_context() const;
-
-private:
-  bool create_and_connect_socket(Socket &socket, Error &error) override;
-  void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override;
-  void shutdown_ssl_impl(Socket &socket, bool shutdown_socket);
-
-  bool process_socket(const Socket &socket,
-                      std::function<bool(Stream &strm)> callback) override;
-  bool is_ssl() const override;
-
-  bool connect_with_proxy(Socket &sock, Response &res, bool &success,
-                          Error &error);
-  bool initialize_ssl(Socket &socket, Error &error);
-
-  bool load_certs();
-
-  bool verify_host(X509 *server_cert) const;
-  bool verify_host_with_subject_alt_name(X509 *server_cert) const;
-  bool verify_host_with_common_name(X509 *server_cert) const;
-  bool check_host_name(const char *pattern, size_t pattern_len) const;
-
-  SSL_CTX *ctx_;
-  std::mutex ctx_mutex_;
-  std::once_flag initialize_cert_;
-
-  std::vector<std::string> host_components_;
-
-  long verify_result_ = 0;
-
-  friend class ClientImpl;
-};
-#endif
-
-/*
- * Implementation of template methods.
- */
-
-namespace detail {
-
-template <typename T, typename U>
-inline void duration_to_sec_and_usec(const T &duration, U callback) {
-  auto sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
-  auto usec = std::chrono::duration_cast<std::chrono::microseconds>(
-                  duration - std::chrono::seconds(sec))
-                  .count();
-  callback(static_cast<time_t>(sec), static_cast<time_t>(usec));
-}
-
-template <typename T>
-inline T get_header_value(const Headers & /*headers*/,
-                          const std::string & /*key*/, size_t /*id*/ = 0,
-                          uint64_t /*def*/ = 0) {}
-
-template <>
-inline uint64_t get_header_value<uint64_t>(const Headers &headers,
-                                           const std::string &key, size_t id,
-                                           uint64_t def) {
-  auto rng = headers.equal_range(key);
-  auto it = rng.first;
-  std::advance(it, static_cast<ssize_t>(id));
-  if (it != rng.second) {
-    return std::strtoull(it->second.data(), nullptr, 10);
-  }
-  return def;
-}
-
-} // namespace detail
-
-template <typename T>
-inline T Request::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value<T>(headers, key, id, 0);
-}
-
-template <typename T>
-inline T Response::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value<T>(headers, key, id, 0);
-}
-
-template <typename... Args>
-inline ssize_t Stream::write_format(const char *fmt, const Args &...args) {
-  const auto bufsiz = 2048;
-  std::array<char, bufsiz> buf{};
-
-  auto sn = snprintf(buf.data(), buf.size() - 1, fmt, args...);
-  if (sn <= 0) { return sn; }
-
-  auto n = static_cast<size_t>(sn);
-
-  if (n >= buf.size() - 1) {
-    std::vector<char> glowable_buf(buf.size());
-
-    while (n >= glowable_buf.size() - 1) {
-      glowable_buf.resize(glowable_buf.size() * 2);
-      n = static_cast<size_t>(
-          snprintf(&glowable_buf[0], glowable_buf.size() - 1, fmt, args...));
-    }
-    return write(&glowable_buf[0], n);
-  } else {
-    return write(buf.data(), n);
-  }
-}
-
-inline void default_socket_options(socket_t sock) {
-  int yes = 1;
-#ifdef _WIN32
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<char *>(&yes),
-             sizeof(yes));
-  setsockopt(sock, SOL_SOCKET, SO_EXCLUSIVEADDRUSE,
-             reinterpret_cast<char *>(&yes), sizeof(yes));
-#else
-#ifdef SO_REUSEPORT
-  setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, reinterpret_cast<void *>(&yes),
-             sizeof(yes));
-#else
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast<void *>(&yes),
-             sizeof(yes));
-#endif
-#endif
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(
-      duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(
-      duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-  return *this;
-}
-
-template <class Rep, class Period>
-inline Server &
-Server::set_idle_interval(const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(
-      duration, [&](time_t sec, time_t usec) { set_idle_interval(sec, usec); });
-  return *this;
-}
-
-inline std::string to_string(const Error error) {
-  switch (error) {
-  case Error::Success: return "Success (no error)";
-  case Error::Connection: return "Could not establish connection";
-  case Error::BindIPAddress: return "Failed to bind IP address";
-  case Error::Read: return "Failed to read connection";
-  case Error::Write: return "Failed to write connection";
-  case Error::ExceedRedirectCount: return "Maximum redirect count exceeded";
-  case Error::Canceled: return "Connection handling canceled";
-  case Error::SSLConnection: return "SSL connection failed";
-  case Error::SSLLoadingCerts: return "SSL certificate loading failed";
-  case Error::SSLServerVerification: return "SSL server verification failed";
-  case Error::UnsupportedMultipartBoundaryChars:
-    return "Unsupported HTTP multipart boundary characters";
-  case Error::Compression: return "Compression failed";
-  case Error::ConnectionTimeout: return "Connection timed out";
-  case Error::Unknown: return "Unknown";
-  default: break;
-  }
-
-  return "Invalid";
-}
-
-inline std::ostream &operator<<(std::ostream &os, const Error &obj) {
-  os << to_string(obj);
-  os << " (" << static_cast<std::underlying_type<Error>::type>(obj) << ')';
-  return os;
-}
-
-template <typename T>
-inline T Result::get_request_header_value(const std::string &key,
-                                          size_t id) const {
-  return detail::get_header_value<T>(request_headers_, key, id, 0);
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_connection_timeout(
-    const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t usec) {
-    set_connection_timeout(sec, usec);
-  });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_read_timeout(
-    const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(
-      duration, [&](time_t sec, time_t usec) { set_read_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void ClientImpl::set_write_timeout(
-    const std::chrono::duration<Rep, Period> &duration) {
-  detail::duration_to_sec_and_usec(
-      duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); });
-}
-
-template <class Rep, class Period>
-inline void Client::set_connection_timeout(
-    const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_connection_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void
-Client::set_read_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_read_timeout(duration);
-}
-
-template <class Rep, class Period>
-inline void
-Client::set_write_timeout(const std::chrono::duration<Rep, Period> &duration) {
-  cli_->set_write_timeout(duration);
-}
-
-/*
- * Forward declarations and types that will be part of the .h file if split into
- * .h + .cc.
- */
-
-std::string hosted_at(const std::string &hostname);
-
-void hosted_at(const std::string &hostname, std::vector<std::string> &addrs);
-
-std::string append_query_params(const std::string &path, const Params &params);
-
-std::pair<std::string, std::string> make_range_header(Ranges ranges);
-
-std::pair<std::string, std::string>
-make_basic_authentication_header(const std::string &username,
-                                 const std::string &password,
-                                 bool is_proxy = false);
-
-namespace detail {
-
-std::string encode_query_param(const std::string &value);
-
-std::string decode_url(const std::string &s, bool convert_plus_to_space);
-
-void read_file(const std::string &path, std::string &out);
-
-std::string trim_copy(const std::string &s);
-
-void split(const char *b, const char *e, char d,
-           std::function<void(const char *, const char *)> fn);
-
-bool process_client_socket(socket_t sock, time_t read_timeout_sec,
-                           time_t read_timeout_usec, time_t write_timeout_sec,
-                           time_t write_timeout_usec,
-                           std::function<bool(Stream &)> callback);
-
-socket_t create_client_socket(
-    const std::string &host, const std::string &ip, int port,
-    int address_family, bool tcp_nodelay, SocketOptions socket_options,
-    time_t connection_timeout_sec, time_t connection_timeout_usec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, const std::string &intf, Error &error);
-
-const char *get_header_value(const Headers &headers, const std::string &key,
-                             size_t id = 0, const char *def = nullptr);
-
-std::string params_to_query_str(const Params &params);
-
-void parse_query_text(const std::string &s, Params &params);
-
-bool parse_multipart_boundary(const std::string &content_type,
-                              std::string &boundary);
-
-bool parse_range_header(const std::string &s, Ranges &ranges);
-
-int close_socket(socket_t sock);
-
-ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags);
-
-ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags);
-
-enum class EncodingType { None = 0, Gzip, Brotli };
-
-EncodingType encoding_type(const Request &req, const Response &res);
-
-class BufferStream : public Stream {
-public:
-  BufferStream() = default;
-  ~BufferStream() override = default;
-
-  bool is_readable() const override;
-  bool is_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-
-  const std::string &get_buffer() const;
-
-private:
-  std::string buffer;
-  size_t position = 0;
-};
-
-class compressor {
-public:
-  virtual ~compressor() = default;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool compress(const char *data, size_t data_length, bool last,
-                        Callback callback) = 0;
-};
-
-class decompressor {
-public:
-  virtual ~decompressor() = default;
-
-  virtual bool is_valid() const = 0;
-
-  typedef std::function<bool(const char *data, size_t data_len)> Callback;
-  virtual bool decompress(const char *data, size_t data_length,
-                          Callback callback) = 0;
-};
-
-class nocompressor : public compressor {
-public:
-  virtual ~nocompressor() = default;
-
-  bool compress(const char *data, size_t data_length, bool /*last*/,
-                Callback callback) override;
-};
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-class gzip_compressor : public compressor {
-public:
-  gzip_compressor();
-  ~gzip_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last,
-                Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-
-class gzip_decompressor : public decompressor {
-public:
-  gzip_decompressor();
-  ~gzip_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length,
-                  Callback callback) override;
-
-private:
-  bool is_valid_ = false;
-  z_stream strm_;
-};
-#endif
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-class brotli_compressor : public compressor {
-public:
-  brotli_compressor();
-  ~brotli_compressor();
-
-  bool compress(const char *data, size_t data_length, bool last,
-                Callback callback) override;
-
-private:
-  BrotliEncoderState *state_ = nullptr;
-};
-
-class brotli_decompressor : public decompressor {
-public:
-  brotli_decompressor();
-  ~brotli_decompressor();
-
-  bool is_valid() const override;
-
-  bool decompress(const char *data, size_t data_length,
-                  Callback callback) override;
-
-private:
-  BrotliDecoderResult decoder_r;
-  BrotliDecoderState *decoder_s = nullptr;
-};
-#endif
-
-// NOTE: until the read size reaches `fixed_buffer_size`, use `fixed_buffer`
-// to store data. The call can set memory on stack for performance.
-class stream_line_reader {
-public:
-  stream_line_reader(Stream &strm, char *fixed_buffer,
-                     size_t fixed_buffer_size);
-  const char *ptr() const;
-  size_t size() const;
-  bool end_with_crlf() const;
-  bool getline();
-
-private:
-  void append(char c);
-
-  Stream &strm_;
-  char *fixed_buffer_;
-  const size_t fixed_buffer_size_;
-  size_t fixed_buffer_used_size_ = 0;
-  std::string glowable_buffer_;
-};
-
-} // namespace detail
-
-// ----------------------------------------------------------------------------
-
-/*
- * Implementation that will be part of the .cc file if split into .h + .cc.
- */
-
-namespace detail {
-
-inline bool is_hex(char c, int &v) {
-  if (0x20 <= c && isdigit(c)) {
-    v = c - '0';
-    return true;
-  } else if ('A' <= c && c <= 'F') {
-    v = c - 'A' + 10;
-    return true;
-  } else if ('a' <= c && c <= 'f') {
-    v = c - 'a' + 10;
-    return true;
-  }
-  return false;
-}
-
-inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt,
-                          int &val) {
-  if (i >= s.size()) { return false; }
-
-  val = 0;
-  for (; cnt; i++, cnt--) {
-    if (!s[i]) { return false; }
-    int v = 0;
-    if (is_hex(s[i], v)) {
-      val = val * 16 + v;
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-inline std::string from_i_to_hex(size_t n) {
-  const char *charset = "0123456789abcdef";
-  std::string ret;
-  do {
-    ret = charset[n & 15] + ret;
-    n >>= 4;
-  } while (n > 0);
-  return ret;
-}
-
-inline size_t to_utf8(int code, char *buff) {
-  if (code < 0x0080) {
-    buff[0] = (code & 0x7F);
-    return 1;
-  } else if (code < 0x0800) {
-    buff[0] = static_cast<char>(0xC0 | ((code >> 6) & 0x1F));
-    buff[1] = static_cast<char>(0x80 | (code & 0x3F));
-    return 2;
-  } else if (code < 0xD800) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0xE000) { // D800 - DFFF is invalid...
-    return 0;
-  } else if (code < 0x10000) {
-    buff[0] = static_cast<char>(0xE0 | ((code >> 12) & 0xF));
-    buff[1] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | (code & 0x3F));
-    return 3;
-  } else if (code < 0x110000) {
-    buff[0] = static_cast<char>(0xF0 | ((code >> 18) & 0x7));
-    buff[1] = static_cast<char>(0x80 | ((code >> 12) & 0x3F));
-    buff[2] = static_cast<char>(0x80 | ((code >> 6) & 0x3F));
-    buff[3] = static_cast<char>(0x80 | (code & 0x3F));
-    return 4;
-  }
-
-  // NOTREACHED
-  return 0;
-}
-
-// NOTE: This code came up with the following stackoverflow post:
-// https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c
-inline std::string base64_encode(const std::string &in) {
-  static const auto lookup =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-  std::string out;
-  out.reserve(in.size());
-
-  int val = 0;
-  int valb = -6;
-
-  for (auto c : in) {
-    val = (val << 8) + static_cast<uint8_t>(c);
-    valb += 8;
-    while (valb >= 0) {
-      out.push_back(lookup[(val >> valb) & 0x3F]);
-      valb -= 6;
-    }
-  }
-
-  if (valb > -6) { out.push_back(lookup[((val << 8) >> (valb + 8)) & 0x3F]); }
-
-  while (out.size() % 4) {
-    out.push_back('=');
-  }
-
-  return out;
-}
-
-inline bool is_file(const std::string &path) {
-#ifdef _WIN32
-  return _access_s(path.c_str(), 0) == 0;
-#else
-  struct stat st;
-  return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode);
-#endif
-}
-
-inline bool is_dir(const std::string &path) {
-  struct stat st;
-  return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode);
-}
-
-inline bool is_valid_path(const std::string &path) {
-  size_t level = 0;
-  size_t i = 0;
-
-  // Skip slash
-  while (i < path.size() && path[i] == '/') {
-    i++;
-  }
-
-  while (i < path.size()) {
-    // Read component
-    auto beg = i;
-    while (i < path.size() && path[i] != '/') {
-      i++;
-    }
-
-    auto len = i - beg;
-    assert(len > 0);
-
-    if (!path.compare(beg, len, ".")) {
-      ;
-    } else if (!path.compare(beg, len, "..")) {
-      if (level == 0) { return false; }
-      level--;
-    } else {
-      level++;
-    }
-
-    // Skip slash
-    while (i < path.size() && path[i] == '/') {
-      i++;
-    }
-  }
-
-  return true;
-}
-
-inline std::string encode_query_param(const std::string &value) {
-  std::ostringstream escaped;
-  escaped.fill('0');
-  escaped << std::hex;
-
-  for (auto c : value) {
-    if (std::isalnum(static_cast<uint8_t>(c)) || c == '-' || c == '_' ||
-        c == '.' || c == '!' || c == '~' || c == '*' || c == '\'' || c == '(' ||
-        c == ')') {
-      escaped << c;
-    } else {
-      escaped << std::uppercase;
-      escaped << '%' << std::setw(2)
-              << static_cast<int>(static_cast<unsigned char>(c));
-      escaped << std::nouppercase;
-    }
-  }
-
-  return escaped.str();
-}
-
-inline std::string encode_url(const std::string &s) {
-  std::string result;
-  result.reserve(s.size());
-
-  for (size_t i = 0; s[i]; i++) {
-    switch (s[i]) {
-    case ' ': result += "%20"; break;
-    case '+': result += "%2B"; break;
-    case '\r': result += "%0D"; break;
-    case '\n': result += "%0A"; break;
-    case '\'': result += "%27"; break;
-    case ',': result += "%2C"; break;
-    // case ':': result += "%3A"; break; // ok? probably...
-    case ';': result += "%3B"; break;
-    default:
-      auto c = static_cast<uint8_t>(s[i]);
-      if (c >= 0x80) {
-        result += '%';
-        char hex[4];
-        auto len = snprintf(hex, sizeof(hex) - 1, "%02X", c);
-        assert(len == 2);
-        result.append(hex, static_cast<size_t>(len));
-      } else {
-        result += s[i];
-      }
-      break;
-    }
-  }
-
-  return result;
-}
-
-inline std::string decode_url(const std::string &s,
-                              bool convert_plus_to_space) {
-  std::string result;
-
-  for (size_t i = 0; i < s.size(); i++) {
-    if (s[i] == '%' && i + 1 < s.size()) {
-      if (s[i + 1] == 'u') {
-        int val = 0;
-        if (from_hex_to_i(s, i + 2, 4, val)) {
-          // 4 digits Unicode codes
-          char buff[4];
-          size_t len = to_utf8(val, buff);
-          if (len > 0) { result.append(buff, len); }
-          i += 5; // 'u0000'
-        } else {
-          result += s[i];
-        }
-      } else {
-        int val = 0;
-        if (from_hex_to_i(s, i + 1, 2, val)) {
-          // 2 digits hex codes
-          result += static_cast<char>(val);
-          i += 2; // '00'
-        } else {
-          result += s[i];
-        }
-      }
-    } else if (convert_plus_to_space && s[i] == '+') {
-      result += ' ';
-    } else {
-      result += s[i];
-    }
-  }
-
-  return result;
-}
-
-inline void read_file(const std::string &path, std::string &out) {
-  std::ifstream fs(path, std::ios_base::binary);
-  fs.seekg(0, std::ios_base::end);
-  auto size = fs.tellg();
-  fs.seekg(0);
-  out.resize(static_cast<size_t>(size));
-  fs.read(&out[0], static_cast<std::streamsize>(size));
-}
-
-inline std::string file_extension(const std::string &path) {
-  std::smatch m;
-  static auto re = std::regex("\\.([a-zA-Z0-9]+)$");
-  if (std::regex_search(path, m, re)) { return m[1].str(); }
-  return std::string();
-}
-
-inline bool is_space_or_tab(char c) { return c == ' ' || c == '\t'; }
-
-inline std::pair<size_t, size_t> trim(const char *b, const char *e, size_t left,
-                                      size_t right) {
-  while (b + left < e && is_space_or_tab(b[left])) {
-    left++;
-  }
-  while (right > 0 && is_space_or_tab(b[right - 1])) {
-    right--;
-  }
-  return std::make_pair(left, right);
-}
-
-inline std::string trim_copy(const std::string &s) {
-  auto r = trim(s.data(), s.data() + s.size(), 0, s.size());
-  return s.substr(r.first, r.second - r.first);
-}
-
-inline void split(const char *b, const char *e, char d,
-                  std::function<void(const char *, const char *)> fn) {
-  size_t i = 0;
-  size_t beg = 0;
-
-  while (e ? (b + i < e) : (b[i] != '\0')) {
-    if (b[i] == d) {
-      auto r = trim(b, e, beg, i);
-      if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-      beg = i + 1;
-    }
-    i++;
-  }
-
-  if (i) {
-    auto r = trim(b, e, beg, i);
-    if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
-  }
-}
-
-inline stream_line_reader::stream_line_reader(Stream &strm, char *fixed_buffer,
-                                              size_t fixed_buffer_size)
-    : strm_(strm), fixed_buffer_(fixed_buffer),
-      fixed_buffer_size_(fixed_buffer_size) {}
-
-inline const char *stream_line_reader::ptr() const {
-  if (glowable_buffer_.empty()) {
-    return fixed_buffer_;
-  } else {
-    return glowable_buffer_.data();
-  }
-}
-
-inline size_t stream_line_reader::size() const {
-  if (glowable_buffer_.empty()) {
-    return fixed_buffer_used_size_;
-  } else {
-    return glowable_buffer_.size();
-  }
-}
-
-inline bool stream_line_reader::end_with_crlf() const {
-  auto end = ptr() + size();
-  return size() >= 2 && end[-2] == '\r' && end[-1] == '\n';
-}
-
-inline bool stream_line_reader::getline() {
-  fixed_buffer_used_size_ = 0;
-  glowable_buffer_.clear();
-
-  for (size_t i = 0;; i++) {
-    char byte;
-    auto n = strm_.read(&byte, 1);
-
-    if (n < 0) {
-      return false;
-    } else if (n == 0) {
-      if (i == 0) {
-        return false;
-      } else {
-        break;
-      }
-    }
-
-    append(byte);
-
-    if (byte == '\n') { break; }
-  }
-
-  return true;
-}
-
-inline void stream_line_reader::append(char c) {
-  if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) {
-    fixed_buffer_[fixed_buffer_used_size_++] = c;
-    fixed_buffer_[fixed_buffer_used_size_] = '\0';
-  } else {
-    if (glowable_buffer_.empty()) {
-      assert(fixed_buffer_[fixed_buffer_used_size_] == '\0');
-      glowable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_);
-    }
-    glowable_buffer_ += c;
-  }
-}
-
-inline int close_socket(socket_t sock) {
-#ifdef _WIN32
-  return closesocket(sock);
-#else
-  return close(sock);
-#endif
-}
-
-template <typename T> inline ssize_t handle_EINTR(T fn) {
-  ssize_t res = false;
-  while (true) {
-    res = fn();
-    if (res < 0 && errno == EINTR) { continue; }
-    break;
-  }
-  return res;
-}
-
-inline ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags) {
-  return handle_EINTR([&]() {
-    return recv(sock,
-#ifdef _WIN32
-                static_cast<char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline ssize_t send_socket(socket_t sock, const void *ptr, size_t size,
-                           int flags) {
-  return handle_EINTR([&]() {
-    return send(sock,
-#ifdef _WIN32
-                static_cast<const char *>(ptr), static_cast<int>(size),
-#else
-                ptr, size,
-#endif
-                flags);
-  });
-}
-
-inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
-#ifdef CPPHTTPLIB_USE_POLL
-  struct pollfd pfd_read;
-  pfd_read.fd = sock;
-  pfd_read.events = POLLIN;
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
-#else
-#ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
-#endif
-
-  fd_set fds;
-  FD_ZERO(&fds);
-  FD_SET(sock, &fds);
-
-  timeval tv;
-  tv.tv_sec = static_cast<long>(sec);
-  tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
-
-  return handle_EINTR([&]() {
-    return select(static_cast<int>(sock + 1), &fds, nullptr, nullptr, &tv);
-  });
-#endif
-}
-
-inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
-#ifdef CPPHTTPLIB_USE_POLL
-  struct pollfd pfd_read;
-  pfd_read.fd = sock;
-  pfd_read.events = POLLOUT;
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
-#else
-#ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
-#endif
-
-  fd_set fds;
-  FD_ZERO(&fds);
-  FD_SET(sock, &fds);
-
-  timeval tv;
-  tv.tv_sec = static_cast<long>(sec);
-  tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
-
-  return handle_EINTR([&]() {
-    return select(static_cast<int>(sock + 1), nullptr, &fds, nullptr, &tv);
-  });
-#endif
-}
-
-inline Error wait_until_socket_is_ready(socket_t sock, time_t sec,
-                                        time_t usec) {
-#ifdef CPPHTTPLIB_USE_POLL
-  struct pollfd pfd_read;
-  pfd_read.fd = sock;
-  pfd_read.events = POLLIN | POLLOUT;
-
-  auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
-
-  auto poll_res = handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
-
-  if (poll_res == 0) { return Error::ConnectionTimeout; }
-
-  if (poll_res > 0 && pfd_read.revents & (POLLIN | POLLOUT)) {
-    int error = 0;
-    socklen_t len = sizeof(error);
-    auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
-                          reinterpret_cast<char *>(&error), &len);
-    auto successful = res >= 0 && !error;
-    return successful ? Error::Success : Error::Connection;
-  }
-
-  return Error::Connection;
-#else
-#ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return Error::Connection; }
-#endif
-
-  fd_set fdsr;
-  FD_ZERO(&fdsr);
-  FD_SET(sock, &fdsr);
-
-  auto fdsw = fdsr;
-  auto fdse = fdsr;
-
-  timeval tv;
-  tv.tv_sec = static_cast<long>(sec);
-  tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
-
-  auto ret = handle_EINTR([&]() {
-    return select(static_cast<int>(sock + 1), &fdsr, &fdsw, &fdse, &tv);
-  });
-
-  if (ret == 0) { return Error::ConnectionTimeout; }
-
-  if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
-    int error = 0;
-    socklen_t len = sizeof(error);
-    auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
-                          reinterpret_cast<char *>(&error), &len);
-    auto successful = res >= 0 && !error;
-    return successful ? Error::Success : Error::Connection;
-  }
-  return Error::Connection;
-#endif
-}
-
-inline bool is_socket_alive(socket_t sock) {
-  const auto val = detail::select_read(sock, 0, 0);
-  if (val == 0) {
-    return true;
-  } else if (val < 0 && errno == EBADF) {
-    return false;
-  }
-  char buf[1];
-  return detail::read_socket(sock, &buf[0], sizeof(buf), MSG_PEEK) > 0;
-}
-
-class SocketStream : public Stream {
-public:
-  SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
-               time_t write_timeout_sec, time_t write_timeout_usec);
-  ~SocketStream() override;
-
-  bool is_readable() const override;
-  bool is_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-
-private:
-  socket_t sock_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-
-  std::vector<char> read_buff_;
-  size_t read_buff_off_ = 0;
-  size_t read_buff_content_size_ = 0;
-
-  static const size_t read_buff_size_ = 1024 * 4;
-};
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream : public Stream {
-public:
-  SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec,
-                  time_t read_timeout_usec, time_t write_timeout_sec,
-                  time_t write_timeout_usec);
-  ~SSLSocketStream() override;
-
-  bool is_readable() const override;
-  bool is_writable() const override;
-  ssize_t read(char *ptr, size_t size) override;
-  ssize_t write(const char *ptr, size_t size) override;
-  void get_remote_ip_and_port(std::string &ip, int &port) const override;
-  void get_local_ip_and_port(std::string &ip, int &port) const override;
-  socket_t socket() const override;
-
-private:
-  socket_t sock_;
-  SSL *ssl_;
-  time_t read_timeout_sec_;
-  time_t read_timeout_usec_;
-  time_t write_timeout_sec_;
-  time_t write_timeout_usec_;
-};
-#endif
-
-inline bool keep_alive(socket_t sock, time_t keep_alive_timeout_sec) {
-  using namespace std::chrono;
-  auto start = steady_clock::now();
-  while (true) {
-    auto val = select_read(sock, 0, 10000);
-    if (val < 0) {
-      return false;
-    } else if (val == 0) {
-      auto current = steady_clock::now();
-      auto duration = duration_cast<milliseconds>(current - start);
-      auto timeout = keep_alive_timeout_sec * 1000;
-      if (duration.count() > timeout) { return false; }
-      std::this_thread::sleep_for(std::chrono::milliseconds(1));
-    } else {
-      return true;
-    }
-  }
-}
-
-template <typename T>
-inline bool
-process_server_socket_core(const std::atomic<socket_t> &svr_sock, socket_t sock,
-                           size_t keep_alive_max_count,
-                           time_t keep_alive_timeout_sec, T callback) {
-  assert(keep_alive_max_count > 0);
-  auto ret = false;
-  auto count = keep_alive_max_count;
-  while (svr_sock != INVALID_SOCKET && count > 0 &&
-         keep_alive(sock, keep_alive_timeout_sec)) {
-    auto close_connection = count == 1;
-    auto connection_closed = false;
-    ret = callback(close_connection, connection_closed);
-    if (!ret || connection_closed) { break; }
-    count--;
-  }
-  return ret;
-}
-
-template <typename T>
-inline bool
-process_server_socket(const std::atomic<socket_t> &svr_sock, socket_t sock,
-                      size_t keep_alive_max_count,
-                      time_t keep_alive_timeout_sec, time_t read_timeout_sec,
-                      time_t read_timeout_usec, time_t write_timeout_sec,
-                      time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SocketStream strm(sock, read_timeout_sec, read_timeout_usec,
-                          write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-inline bool process_client_socket(socket_t sock, time_t read_timeout_sec,
-                                  time_t read_timeout_usec,
-                                  time_t write_timeout_sec,
-                                  time_t write_timeout_usec,
-                                  std::function<bool(Stream &)> callback) {
-  SocketStream strm(sock, read_timeout_sec, read_timeout_usec,
-                    write_timeout_sec, write_timeout_usec);
-  return callback(strm);
-}
-
-inline int shutdown_socket(socket_t sock) {
-#ifdef _WIN32
-  return shutdown(sock, SD_BOTH);
-#else
-  return shutdown(sock, SHUT_RDWR);
-#endif
-}
-
-template <typename BindOrConnect>
-socket_t create_socket(const std::string &host, const std::string &ip, int port,
-                       int address_family, int socket_flags, bool tcp_nodelay,
-                       SocketOptions socket_options,
-                       BindOrConnect bind_or_connect) {
-  // Get address info
-  const char *node = nullptr;
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
-
-  if (!ip.empty()) {
-    node = ip.c_str();
-    // Ask getaddrinfo to convert IP in c-string to address
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_flags = AI_NUMERICHOST;
-  } else {
-    if (!host.empty()) { node = host.c_str(); }
-    hints.ai_family = address_family;
-    hints.ai_flags = socket_flags;
-  }
-
-#ifndef _WIN32
-  if (hints.ai_family == AF_UNIX) {
-    const auto addrlen = host.length();
-    if (addrlen > sizeof(sockaddr_un::sun_path)) return INVALID_SOCKET;
-
-    auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol);
-    if (sock != INVALID_SOCKET) {
-      sockaddr_un addr{};
-      addr.sun_family = AF_UNIX;
-      std::copy(host.begin(), host.end(), addr.sun_path);
-
-      hints.ai_addr = reinterpret_cast<sockaddr *>(&addr);
-      hints.ai_addrlen = static_cast<socklen_t>(
-          sizeof(addr) - sizeof(addr.sun_path) + addrlen);
-
-      fcntl(sock, F_SETFD, FD_CLOEXEC);
-      if (socket_options) { socket_options(sock); }
-
-      if (!bind_or_connect(sock, hints)) {
-        close_socket(sock);
-        sock = INVALID_SOCKET;
-      }
-    }
-    return sock;
-  }
-#endif
-
-  auto service = std::to_string(port);
-
-  if (getaddrinfo(node, service.c_str(), &hints, &result)) {
-#if defined __linux__ && !defined __ANDROID__
-    res_init();
-#endif
-    return INVALID_SOCKET;
-  }
-
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    // Create a socket
-#ifdef _WIN32
-    auto sock =
-        WSASocketW(rp->ai_family, rp->ai_socktype, rp->ai_protocol, nullptr, 0,
-                   WSA_FLAG_NO_HANDLE_INHERIT | WSA_FLAG_OVERLAPPED);
-    /**
-     * Since the WSA_FLAG_NO_HANDLE_INHERIT is only supported on Windows 7 SP1
-     * and above the socket creation fails on older Windows Systems.
-     *
-     * Let's try to create a socket the old way in this case.
-     *
-     * Reference:
-     * https://docs.microsoft.com/en-us/windows/win32/api/winsock2/nf-winsock2-wsasocketa
-     *
-     * WSA_FLAG_NO_HANDLE_INHERIT:
-     * This flag is supported on Windows 7 with SP1, Windows Server 2008 R2 with
-     * SP1, and later
-     *
-     */
-    if (sock == INVALID_SOCKET) {
-      sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-    }
-#else
-    auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-#endif
-    if (sock == INVALID_SOCKET) { continue; }
-
-#ifndef _WIN32
-    if (fcntl(sock, F_SETFD, FD_CLOEXEC) == -1) {
-      close_socket(sock);
-      continue;
-    }
-#endif
-
-    if (tcp_nodelay) {
-      int yes = 1;
-      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&yes),
-                 sizeof(yes));
-    }
-
-    if (socket_options) { socket_options(sock); }
-
-    if (rp->ai_family == AF_INET6) {
-      int no = 0;
-      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, reinterpret_cast<char *>(&no),
-                 sizeof(no));
-    }
-
-    // bind or connect
-    if (bind_or_connect(sock, *rp)) {
-      freeaddrinfo(result);
-      return sock;
-    }
-
-    close_socket(sock);
-  }
-
-  freeaddrinfo(result);
-  return INVALID_SOCKET;
-}
-
-inline void set_nonblocking(socket_t sock, bool nonblocking) {
-#ifdef _WIN32
-  auto flags = nonblocking ? 1UL : 0UL;
-  ioctlsocket(sock, FIONBIO, &flags);
-#else
-  auto flags = fcntl(sock, F_GETFL, 0);
-  fcntl(sock, F_SETFL,
-        nonblocking ? (flags | O_NONBLOCK) : (flags & (~O_NONBLOCK)));
-#endif
-}
-
-inline bool is_connection_error() {
-#ifdef _WIN32
-  return WSAGetLastError() != WSAEWOULDBLOCK;
-#else
-  return errno != EINPROGRESS;
-#endif
-}
-
-inline bool bind_ip_address(socket_t sock, const std::string &host) {
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
-
-  if (getaddrinfo(host.c_str(), "0", &hints, &result)) { return false; }
-
-  auto ret = false;
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    const auto &ai = *rp;
-    if (!::bind(sock, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen))) {
-      ret = true;
-      break;
-    }
-  }
-
-  freeaddrinfo(result);
-  return ret;
-}
-
-#if !defined _WIN32 && !defined ANDROID && !defined _AIX
-#define USE_IF2IP
-#endif
-
-#ifdef USE_IF2IP
-inline std::string if2ip(int address_family, const std::string &ifn) {
-  struct ifaddrs *ifap;
-  getifaddrs(&ifap);
-  std::string addr_candidate;
-  for (auto ifa = ifap; ifa; ifa = ifa->ifa_next) {
-    if (ifa->ifa_addr && ifn == ifa->ifa_name &&
-        (AF_UNSPEC == address_family ||
-         ifa->ifa_addr->sa_family == address_family)) {
-      if (ifa->ifa_addr->sa_family == AF_INET) {
-        auto sa = reinterpret_cast<struct sockaddr_in *>(ifa->ifa_addr);
-        char buf[INET_ADDRSTRLEN];
-        if (inet_ntop(AF_INET, &sa->sin_addr, buf, INET_ADDRSTRLEN)) {
-          freeifaddrs(ifap);
-          return std::string(buf, INET_ADDRSTRLEN);
-        }
-      } else if (ifa->ifa_addr->sa_family == AF_INET6) {
-        auto sa = reinterpret_cast<struct sockaddr_in6 *>(ifa->ifa_addr);
-        if (!IN6_IS_ADDR_LINKLOCAL(&sa->sin6_addr)) {
-          char buf[INET6_ADDRSTRLEN] = {};
-          if (inet_ntop(AF_INET6, &sa->sin6_addr, buf, INET6_ADDRSTRLEN)) {
-            // equivalent to mac's IN6_IS_ADDR_UNIQUE_LOCAL
-            auto s6_addr_head = sa->sin6_addr.s6_addr[0];
-            if (s6_addr_head == 0xfc || s6_addr_head == 0xfd) {
-              addr_candidate = std::string(buf, INET6_ADDRSTRLEN);
-            } else {
-              freeifaddrs(ifap);
-              return std::string(buf, INET6_ADDRSTRLEN);
-            }
-          }
-        }
-      }
-    }
-  }
-  freeifaddrs(ifap);
-  return addr_candidate;
-}
-#endif
-
-inline socket_t create_client_socket(
-    const std::string &host, const std::string &ip, int port,
-    int address_family, bool tcp_nodelay, SocketOptions socket_options,
-    time_t connection_timeout_sec, time_t connection_timeout_usec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, const std::string &intf, Error &error) {
-  auto sock = create_socket(
-      host, ip, port, address_family, 0, tcp_nodelay, std::move(socket_options),
-      [&](socket_t sock2, struct addrinfo &ai) -> bool {
-        if (!intf.empty()) {
-#ifdef USE_IF2IP
-          auto ip_from_if = if2ip(address_family, intf);
-          if (ip_from_if.empty()) { ip_from_if = intf; }
-          if (!bind_ip_address(sock2, ip_from_if.c_str())) {
-            error = Error::BindIPAddress;
-            return false;
-          }
-#endif
-        }
-
-        set_nonblocking(sock2, true);
-
-        auto ret =
-            ::connect(sock2, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen));
-
-        if (ret < 0) {
-          if (is_connection_error()) {
-            error = Error::Connection;
-            return false;
-          }
-          error = wait_until_socket_is_ready(sock2, connection_timeout_sec,
-                                             connection_timeout_usec);
-          if (error != Error::Success) { return false; }
-        }
-
-        set_nonblocking(sock2, false);
-
-        {
-#ifdef _WIN32
-          auto timeout = static_cast<uint32_t>(read_timeout_sec * 1000 +
-                                               read_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
-                     sizeof(timeout));
-#else
-          timeval tv;
-          tv.tv_sec = static_cast<long>(read_timeout_sec);
-          tv.tv_usec = static_cast<decltype(tv.tv_usec)>(read_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
-#endif
-        }
-        {
-
-#ifdef _WIN32
-          auto timeout = static_cast<uint32_t>(write_timeout_sec * 1000 +
-                                               write_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
-                     sizeof(timeout));
-#else
-          timeval tv;
-          tv.tv_sec = static_cast<long>(write_timeout_sec);
-          tv.tv_usec = static_cast<decltype(tv.tv_usec)>(write_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
-#endif
-        }
-
-        error = Error::Success;
-        return true;
-      });
-
-  if (sock != INVALID_SOCKET) {
-    error = Error::Success;
-  } else {
-    if (error == Error::Success) { error = Error::Connection; }
-  }
-
-  return sock;
-}
-
-inline bool get_ip_and_port(const struct sockaddr_storage &addr,
-                            socklen_t addr_len, std::string &ip, int &port) {
-  if (addr.ss_family == AF_INET) {
-    port = ntohs(reinterpret_cast<const struct sockaddr_in *>(&addr)->sin_port);
-  } else if (addr.ss_family == AF_INET6) {
-    port =
-        ntohs(reinterpret_cast<const struct sockaddr_in6 *>(&addr)->sin6_port);
-  } else {
-    return false;
-  }
-
-  std::array<char, NI_MAXHOST> ipstr{};
-  if (getnameinfo(reinterpret_cast<const struct sockaddr *>(&addr), addr_len,
-                  ipstr.data(), static_cast<socklen_t>(ipstr.size()), nullptr,
-                  0, NI_NUMERICHOST)) {
-    return false;
-  }
-
-  ip = ipstr.data();
-  return true;
-}
-
-inline void get_local_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-  if (!getsockname(sock, reinterpret_cast<struct sockaddr *>(&addr),
-                   &addr_len)) {
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline void get_remote_ip_and_port(socket_t sock, std::string &ip, int &port) {
-  struct sockaddr_storage addr;
-  socklen_t addr_len = sizeof(addr);
-
-  if (!getpeername(sock, reinterpret_cast<struct sockaddr *>(&addr),
-                   &addr_len)) {
-#ifndef _WIN32
-    if (addr.ss_family == AF_UNIX) {
-#if defined(__linux__)
-      struct ucred ucred;
-      socklen_t len = sizeof(ucred);
-      if (getsockopt(sock, SOL_SOCKET, SO_PEERCRED, &ucred, &len) == 0) {
-        port = ucred.pid;
-      }
-#elif defined(SOL_LOCAL) && defined(SO_PEERPID) // __APPLE__
-      pid_t pid;
-      socklen_t len = sizeof(pid);
-      if (getsockopt(sock, SOL_LOCAL, SO_PEERPID, &pid, &len) == 0) {
-        port = pid;
-      }
-#endif
-      return;
-    }
-#endif
-    get_ip_and_port(addr, addr_len, ip, port);
-  }
-}
-
-inline constexpr unsigned int str2tag_core(const char *s, size_t l,
-                                           unsigned int h) {
-  return (l == 0)
-             ? h
-             : str2tag_core(
-                   s + 1, l - 1,
-                   // Unsets the 6 high bits of h, therefore no overflow happens
-                   (((std::numeric_limits<unsigned int>::max)() >> 6) &
-                    h * 33) ^
-                       static_cast<unsigned char>(*s));
-}
-
-inline unsigned int str2tag(const std::string &s) {
-  return str2tag_core(s.data(), s.size(), 0);
-}
-
-namespace udl {
-
-inline constexpr unsigned int operator"" _t(const char *s, size_t l) {
-  return str2tag_core(s, l, 0);
-}
-
-} // namespace udl
-
-inline const char *
-find_content_type(const std::string &path,
-                  const std::map<std::string, std::string> &user_data) {
-  auto ext = file_extension(path);
-
-  auto it = user_data.find(ext);
-  if (it != user_data.end()) { return it->second.c_str(); }
-
-  using udl::operator""_t;
-
-  switch (str2tag(ext)) {
-  default: return nullptr;
-  case "css"_t: return "text/css";
-  case "csv"_t: return "text/csv";
-  case "htm"_t:
-  case "html"_t: return "text/html";
-  case "js"_t:
-  case "mjs"_t: return "text/javascript";
-  case "txt"_t: return "text/plain";
-  case "vtt"_t: return "text/vtt";
-
-  case "apng"_t: return "image/apng";
-  case "avif"_t: return "image/avif";
-  case "bmp"_t: return "image/bmp";
-  case "gif"_t: return "image/gif";
-  case "png"_t: return "image/png";
-  case "svg"_t: return "image/svg+xml";
-  case "webp"_t: return "image/webp";
-  case "ico"_t: return "image/x-icon";
-  case "tif"_t: return "image/tiff";
-  case "tiff"_t: return "image/tiff";
-  case "jpg"_t:
-  case "jpeg"_t: return "image/jpeg";
-
-  case "mp4"_t: return "video/mp4";
-  case "mpeg"_t: return "video/mpeg";
-  case "webm"_t: return "video/webm";
-
-  case "mp3"_t: return "audio/mp3";
-  case "mpga"_t: return "audio/mpeg";
-  case "weba"_t: return "audio/webm";
-  case "wav"_t: return "audio/wave";
-
-  case "otf"_t: return "font/otf";
-  case "ttf"_t: return "font/ttf";
-  case "woff"_t: return "font/woff";
-  case "woff2"_t: return "font/woff2";
-
-  case "7z"_t: return "application/x-7z-compressed";
-  case "atom"_t: return "application/atom+xml";
-  case "pdf"_t: return "application/pdf";
-  case "json"_t: return "application/json";
-  case "rss"_t: return "application/rss+xml";
-  case "tar"_t: return "application/x-tar";
-  case "xht"_t:
-  case "xhtml"_t: return "application/xhtml+xml";
-  case "xslt"_t: return "application/xslt+xml";
-  case "xml"_t: return "application/xml";
-  case "gz"_t: return "application/gzip";
-  case "zip"_t: return "application/zip";
-  case "wasm"_t: return "application/wasm";
-  }
-}
-
-inline const char *status_message(int status) {
-  switch (status) {
-  case 100: return "Continue";
-  case 101: return "Switching Protocol";
-  case 102: return "Processing";
-  case 103: return "Early Hints";
-  case 200: return "OK";
-  case 201: return "Created";
-  case 202: return "Accepted";
-  case 203: return "Non-Authoritative Information";
-  case 204: return "No Content";
-  case 205: return "Reset Content";
-  case 206: return "Partial Content";
-  case 207: return "Multi-Status";
-  case 208: return "Already Reported";
-  case 226: return "IM Used";
-  case 300: return "Multiple Choice";
-  case 301: return "Moved Permanently";
-  case 302: return "Found";
-  case 303: return "See Other";
-  case 304: return "Not Modified";
-  case 305: return "Use Proxy";
-  case 306: return "unused";
-  case 307: return "Temporary Redirect";
-  case 308: return "Permanent Redirect";
-  case 400: return "Bad Request";
-  case 401: return "Unauthorized";
-  case 402: return "Payment Required";
-  case 403: return "Forbidden";
-  case 404: return "Not Found";
-  case 405: return "Method Not Allowed";
-  case 406: return "Not Acceptable";
-  case 407: return "Proxy Authentication Required";
-  case 408: return "Request Timeout";
-  case 409: return "Conflict";
-  case 410: return "Gone";
-  case 411: return "Length Required";
-  case 412: return "Precondition Failed";
-  case 413: return "Payload Too Large";
-  case 414: return "URI Too Long";
-  case 415: return "Unsupported Media Type";
-  case 416: return "Range Not Satisfiable";
-  case 417: return "Expectation Failed";
-  case 418: return "I'm a teapot";
-  case 421: return "Misdirected Request";
-  case 422: return "Unprocessable Entity";
-  case 423: return "Locked";
-  case 424: return "Failed Dependency";
-  case 425: return "Too Early";
-  case 426: return "Upgrade Required";
-  case 428: return "Precondition Required";
-  case 429: return "Too Many Requests";
-  case 431: return "Request Header Fields Too Large";
-  case 451: return "Unavailable For Legal Reasons";
-  case 501: return "Not Implemented";
-  case 502: return "Bad Gateway";
-  case 503: return "Service Unavailable";
-  case 504: return "Gateway Timeout";
-  case 505: return "HTTP Version Not Supported";
-  case 506: return "Variant Also Negotiates";
-  case 507: return "Insufficient Storage";
-  case 508: return "Loop Detected";
-  case 510: return "Not Extended";
-  case 511: return "Network Authentication Required";
-
-  default:
-  case 500: return "Internal Server Error";
-  }
-}
-
-inline bool can_compress_content_type(const std::string &content_type) {
-  using udl::operator""_t;
-
-  auto tag = str2tag(content_type);
-
-  switch (tag) {
-  case "image/svg+xml"_t:
-  case "application/javascript"_t:
-  case "application/json"_t:
-  case "application/xml"_t:
-  case "application/protobuf"_t:
-  case "application/xhtml+xml"_t: return true;
-
-  default:
-    return !content_type.rfind("text/", 0) && tag != "text/event-stream"_t;
-  }
-}
-
-inline EncodingType encoding_type(const Request &req, const Response &res) {
-  auto ret =
-      detail::can_compress_content_type(res.get_header_value("Content-Type"));
-  if (!ret) { return EncodingType::None; }
-
-  const auto &s = req.get_header_value("Accept-Encoding");
-  (void)(s);
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-  // TODO: 'Accept-Encoding' has br, not br;q=0
-  ret = s.find("br") != std::string::npos;
-  if (ret) { return EncodingType::Brotli; }
-#endif
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-  // TODO: 'Accept-Encoding' has gzip, not gzip;q=0
-  ret = s.find("gzip") != std::string::npos;
-  if (ret) { return EncodingType::Gzip; }
-#endif
-
-  return EncodingType::None;
-}
-
-inline bool nocompressor::compress(const char *data, size_t data_length,
-                                   bool /*last*/, Callback callback) {
-  if (!data_length) { return true; }
-  return callback(data, data_length);
-}
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-inline gzip_compressor::gzip_compressor() {
-  std::memset(&strm_, 0, sizeof(strm_));
-  strm_.zalloc = Z_NULL;
-  strm_.zfree = Z_NULL;
-  strm_.opaque = Z_NULL;
-
-  is_valid_ = deflateInit2(&strm_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8,
-                           Z_DEFAULT_STRATEGY) == Z_OK;
-}
-
-inline gzip_compressor::~gzip_compressor() { deflateEnd(&strm_); }
-
-inline bool gzip_compressor::compress(const char *data, size_t data_length,
-                                      bool last, Callback callback) {
-  assert(is_valid_);
-
-  do {
-    constexpr size_t max_avail_in =
-        (std::numeric_limits<decltype(strm_.avail_in)>::max)();
-
-    strm_.avail_in = static_cast<decltype(strm_.avail_in)>(
-        (std::min)(data_length, max_avail_in));
-    strm_.next_in = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(data));
-
-    data_length -= strm_.avail_in;
-    data += strm_.avail_in;
-
-    auto flush = (last && data_length == 0) ? Z_FINISH : Z_NO_FLUSH;
-    int ret = Z_OK;
-
-    std::array<char, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
-    do {
-      strm_.avail_out = static_cast<uInt>(buff.size());
-      strm_.next_out = reinterpret_cast<Bytef *>(buff.data());
-
-      ret = deflate(&strm_, flush);
-      if (ret == Z_STREAM_ERROR) { return false; }
-
-      if (!callback(buff.data(), buff.size() - strm_.avail_out)) {
-        return false;
-      }
-    } while (strm_.avail_out == 0);
-
-    assert((flush == Z_FINISH && ret == Z_STREAM_END) ||
-           (flush == Z_NO_FLUSH && ret == Z_OK));
-    assert(strm_.avail_in == 0);
-  } while (data_length > 0);
-
-  return true;
-}
-
-inline gzip_decompressor::gzip_decompressor() {
-  std::memset(&strm_, 0, sizeof(strm_));
-  strm_.zalloc = Z_NULL;
-  strm_.zfree = Z_NULL;
-  strm_.opaque = Z_NULL;
-
-  // 15 is the value of wbits, which should be at the maximum possible value
-  // to ensure that any gzip stream can be decoded. The offset of 32 specifies
-  // that the stream type should be automatically detected either gzip or
-  // deflate.
-  is_valid_ = inflateInit2(&strm_, 32 + 15) == Z_OK;
-}
-
-inline gzip_decompressor::~gzip_decompressor() { inflateEnd(&strm_); }
-
-inline bool gzip_decompressor::is_valid() const { return is_valid_; }
-
-inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
-                                          Callback callback) {
-  assert(is_valid_);
-
-  int ret = Z_OK;
-
-  do {
-    constexpr size_t max_avail_in =
-        (std::numeric_limits<decltype(strm_.avail_in)>::max)();
-
-    strm_.avail_in = static_cast<decltype(strm_.avail_in)>(
-        (std::min)(data_length, max_avail_in));
-    strm_.next_in = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(data));
-
-    data_length -= strm_.avail_in;
-    data += strm_.avail_in;
-
-    std::array<char, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
-    while (strm_.avail_in > 0) {
-      strm_.avail_out = static_cast<uInt>(buff.size());
-      strm_.next_out = reinterpret_cast<Bytef *>(buff.data());
-
-      auto prev_avail_in = strm_.avail_in;
-
-      ret = inflate(&strm_, Z_NO_FLUSH);
-
-      if (prev_avail_in - strm_.avail_in == 0) { return false; }
-
-      assert(ret != Z_STREAM_ERROR);
-      switch (ret) {
-      case Z_NEED_DICT:
-      case Z_DATA_ERROR:
-      case Z_MEM_ERROR: inflateEnd(&strm_); return false;
-      }
-
-      if (!callback(buff.data(), buff.size() - strm_.avail_out)) {
-        return false;
-      }
-    }
-
-    if (ret != Z_OK && ret != Z_STREAM_END) return false;
-
-  } while (data_length > 0);
-
-  return true;
-}
-#endif
-
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-inline brotli_compressor::brotli_compressor() {
-  state_ = BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);
-}
-
-inline brotli_compressor::~brotli_compressor() {
-  BrotliEncoderDestroyInstance(state_);
-}
-
-inline bool brotli_compressor::compress(const char *data, size_t data_length,
-                                        bool last, Callback callback) {
-  std::array<uint8_t, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
-
-  auto operation = last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS;
-  auto available_in = data_length;
-  auto next_in = reinterpret_cast<const uint8_t *>(data);
-
-  for (;;) {
-    if (last) {
-      if (BrotliEncoderIsFinished(state_)) { break; }
-    } else {
-      if (!available_in) { break; }
-    }
-
-    auto available_out = buff.size();
-    auto next_out = buff.data();
-
-    if (!BrotliEncoderCompressStream(state_, operation, &available_in, &next_in,
-                                     &available_out, &next_out, nullptr)) {
-      return false;
-    }
-
-    auto output_bytes = buff.size() - available_out;
-    if (output_bytes) {
-      callback(reinterpret_cast<const char *>(buff.data()), output_bytes);
-    }
-  }
-
-  return true;
-}
-
-inline brotli_decompressor::brotli_decompressor() {
-  decoder_s = BrotliDecoderCreateInstance(0, 0, 0);
-  decoder_r = decoder_s ? BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT
-                        : BROTLI_DECODER_RESULT_ERROR;
-}
-
-inline brotli_decompressor::~brotli_decompressor() {
-  if (decoder_s) { BrotliDecoderDestroyInstance(decoder_s); }
-}
-
-inline bool brotli_decompressor::is_valid() const { return decoder_s; }
-
-inline bool brotli_decompressor::decompress(const char *data,
-                                            size_t data_length,
-                                            Callback callback) {
-  if (decoder_r == BROTLI_DECODER_RESULT_SUCCESS ||
-      decoder_r == BROTLI_DECODER_RESULT_ERROR) {
-    return 0;
-  }
-
-  const uint8_t *next_in = (const uint8_t *)data;
-  size_t avail_in = data_length;
-  size_t total_out;
-
-  decoder_r = BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT;
-
-  std::array<char, CPPHTTPLIB_COMPRESSION_BUFSIZ> buff{};
-  while (decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
-    char *next_out = buff.data();
-    size_t avail_out = buff.size();
-
-    decoder_r = BrotliDecoderDecompressStream(
-        decoder_s, &avail_in, &next_in, &avail_out,
-        reinterpret_cast<uint8_t **>(&next_out), &total_out);
-
-    if (decoder_r == BROTLI_DECODER_RESULT_ERROR) { return false; }
-
-    if (!callback(buff.data(), buff.size() - avail_out)) { return false; }
-  }
-
-  return decoder_r == BROTLI_DECODER_RESULT_SUCCESS ||
-         decoder_r == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT;
-}
-#endif
-
-inline bool has_header(const Headers &headers, const std::string &key) {
-  return headers.find(key) != headers.end();
-}
-
-inline const char *get_header_value(const Headers &headers,
-                                    const std::string &key, size_t id,
-                                    const char *def) {
-  auto rng = headers.equal_range(key);
-  auto it = rng.first;
-  std::advance(it, static_cast<ssize_t>(id));
-  if (it != rng.second) { return it->second.c_str(); }
-  return def;
-}
-
-inline bool compare_case_ignore(const std::string &a, const std::string &b) {
-  if (a.size() != b.size()) { return false; }
-  for (size_t i = 0; i < b.size(); i++) {
-    if (::tolower(a[i]) != ::tolower(b[i])) { return false; }
-  }
-  return true;
-}
-
-template <typename T>
-inline bool parse_header(const char *beg, const char *end, T fn) {
-  // Skip trailing spaces and tabs.
-  while (beg < end && is_space_or_tab(end[-1])) {
-    end--;
-  }
-
-  auto p = beg;
-  while (p < end && *p != ':') {
-    p++;
-  }
-
-  if (p == end) { return false; }
-
-  auto key_end = p;
-
-  if (*p++ != ':') { return false; }
-
-  while (p < end && is_space_or_tab(*p)) {
-    p++;
-  }
-
-  if (p < end) {
-    auto key = std::string(beg, key_end);
-    auto val = compare_case_ignore(key, "Location")
-                   ? std::string(p, end)
-                   : decode_url(std::string(p, end), false);
-    fn(std::move(key), std::move(val));
-    return true;
-  }
-
-  return false;
-}
-
-inline bool read_headers(Stream &strm, Headers &headers) {
-  const auto bufsiz = 2048;
-  char buf[bufsiz];
-  stream_line_reader line_reader(strm, buf, bufsiz);
-
-  for (;;) {
-    if (!line_reader.getline()) { return false; }
-
-    // Check if the line ends with CRLF.
-    auto line_terminator_len = 2;
-    if (line_reader.end_with_crlf()) {
-      // Blank line indicates end of headers.
-      if (line_reader.size() == 2) { break; }
-#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-    } else {
-      // Blank line indicates end of headers.
-      if (line_reader.size() == 1) { break; }
-      line_terminator_len = 1;
-    }
-#else
-    } else {
-      continue; // Skip invalid line.
-    }
-#endif
-
-    if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; }
-
-    // Exclude line terminator
-    auto end = line_reader.ptr() + line_reader.size() - line_terminator_len;
-
-    parse_header(line_reader.ptr(), end,
-                 [&](std::string &&key, std::string &&val) {
-                   headers.emplace(std::move(key), std::move(val));
-                 });
-  }
-
-  return true;
-}
-
-inline bool read_content_with_length(Stream &strm, uint64_t len,
-                                     Progress progress,
-                                     ContentReceiverWithProgress out) {
-  char buf[CPPHTTPLIB_RECV_BUFSIZ];
-
-  uint64_t r = 0;
-  while (r < len) {
-    auto read_len = static_cast<size_t>(len - r);
-    auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
-    if (n <= 0) { return false; }
-
-    if (!out(buf, static_cast<size_t>(n), r, len)) { return false; }
-    r += static_cast<uint64_t>(n);
-
-    if (progress) {
-      if (!progress(r, len)) { return false; }
-    }
-  }
-
-  return true;
-}
-
-inline void skip_content_with_length(Stream &strm, uint64_t len) {
-  char buf[CPPHTTPLIB_RECV_BUFSIZ];
-  uint64_t r = 0;
-  while (r < len) {
-    auto read_len = static_cast<size_t>(len - r);
-    auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
-    if (n <= 0) { return; }
-    r += static_cast<uint64_t>(n);
-  }
-}
-
-inline bool read_content_without_length(Stream &strm,
-                                        ContentReceiverWithProgress out) {
-  char buf[CPPHTTPLIB_RECV_BUFSIZ];
-  uint64_t r = 0;
-  for (;;) {
-    auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ);
-    if (n < 0) {
-      return false;
-    } else if (n == 0) {
-      return true;
-    }
-
-    if (!out(buf, static_cast<size_t>(n), r, 0)) { return false; }
-    r += static_cast<uint64_t>(n);
-  }
-
-  return true;
-}
-
-template <typename T>
-inline bool read_content_chunked(Stream &strm, T &x,
-                                 ContentReceiverWithProgress out) {
-  const auto bufsiz = 16;
-  char buf[bufsiz];
-
-  stream_line_reader line_reader(strm, buf, bufsiz);
-
-  if (!line_reader.getline()) { return false; }
-
-  unsigned long chunk_len;
-  while (true) {
-    char *end_ptr;
-
-    chunk_len = std::strtoul(line_reader.ptr(), &end_ptr, 16);
-
-    if (end_ptr == line_reader.ptr()) { return false; }
-    if (chunk_len == ULONG_MAX) { return false; }
-
-    if (chunk_len == 0) { break; }
-
-    if (!read_content_with_length(strm, chunk_len, nullptr, out)) {
-      return false;
-    }
-
-    if (!line_reader.getline()) { return false; }
-
-    if (strcmp(line_reader.ptr(), "\r\n")) { return false; }
-
-    if (!line_reader.getline()) { return false; }
-  }
-
-  assert(chunk_len == 0);
-
-  // Trailer
-  if (!line_reader.getline()) { return false; }
-
-  while (strcmp(line_reader.ptr(), "\r\n")) {
-    if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; }
-
-    // Exclude line terminator
-    constexpr auto line_terminator_len = 2;
-    auto end = line_reader.ptr() + line_reader.size() - line_terminator_len;
-
-    parse_header(line_reader.ptr(), end,
-                 [&](std::string &&key, std::string &&val) {
-                   x.headers.emplace(std::move(key), std::move(val));
-                 });
-
-    if (!line_reader.getline()) { return false; }
-  }
-
-  return true;
-}
-
-inline bool is_chunked_transfer_encoding(const Headers &headers) {
-  return !strcasecmp(get_header_value(headers, "Transfer-Encoding", 0, ""),
-                     "chunked");
-}
-
-template <typename T, typename U>
-bool prepare_content_receiver(T &x, int &status,
-                              ContentReceiverWithProgress receiver,
-                              bool decompress, U callback) {
-  if (decompress) {
-    std::string encoding = x.get_header_value("Content-Encoding");
-    std::unique_ptr<decompressor> decompressor;
-
-    if (encoding == "gzip" || encoding == "deflate") {
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-      decompressor = detail::make_unique<gzip_decompressor>();
-#else
-      status = 415;
-      return false;
-#endif
-    } else if (encoding.find("br") != std::string::npos) {
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-      decompressor = detail::make_unique<brotli_decompressor>();
-#else
-      status = 415;
-      return false;
-#endif
-    }
-
-    if (decompressor) {
-      if (decompressor->is_valid()) {
-        ContentReceiverWithProgress out = [&](const char *buf, size_t n,
-                                              uint64_t off, uint64_t len) {
-          return decompressor->decompress(buf, n,
-                                          [&](const char *buf2, size_t n2) {
-                                            return receiver(buf2, n2, off, len);
-                                          });
-        };
-        return callback(std::move(out));
-      } else {
-        status = 500;
-        return false;
-      }
-    }
-  }
-
-  ContentReceiverWithProgress out = [&](const char *buf, size_t n, uint64_t off,
-                                        uint64_t len) {
-    return receiver(buf, n, off, len);
-  };
-  return callback(std::move(out));
-}
-
-template <typename T>
-bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status,
-                  Progress progress, ContentReceiverWithProgress receiver,
-                  bool decompress) {
-  return prepare_content_receiver(
-      x, status, std::move(receiver), decompress,
-      [&](const ContentReceiverWithProgress &out) {
-        auto ret = true;
-        auto exceed_payload_max_length = false;
-
-        if (is_chunked_transfer_encoding(x.headers)) {
-          ret = read_content_chunked(strm, x, out);
-        } else if (!has_header(x.headers, "Content-Length")) {
-          ret = read_content_without_length(strm, out);
-        } else {
-          auto len = get_header_value<uint64_t>(x.headers, "Content-Length");
-          if (len > payload_max_length) {
-            exceed_payload_max_length = true;
-            skip_content_with_length(strm, len);
-            ret = false;
-          } else if (len > 0) {
-            ret = read_content_with_length(strm, len, std::move(progress), out);
-          }
-        }
-
-        if (!ret) { status = exceed_payload_max_length ? 413 : 400; }
-        return ret;
-      });
-} // namespace detail
-
-inline ssize_t write_headers(Stream &strm, const Headers &headers) {
-  ssize_t write_len = 0;
-  for (const auto &x : headers) {
-    auto len =
-        strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str());
-    if (len < 0) { return len; }
-    write_len += len;
-  }
-  auto len = strm.write("\r\n");
-  if (len < 0) { return len; }
-  write_len += len;
-  return write_len;
-}
-
-inline bool write_data(Stream &strm, const char *d, size_t l) {
-  size_t offset = 0;
-  while (offset < l) {
-    auto length = strm.write(d + offset, l - offset);
-    if (length < 0) { return false; }
-    offset += static_cast<size_t>(length);
-  }
-  return true;
-}
-
-template <typename T>
-inline bool write_content(Stream &strm, const ContentProvider &content_provider,
-                          size_t offset, size_t length, T is_shutting_down,
-                          Error &error) {
-  size_t end_offset = offset + length;
-  auto ok = true;
-  DataSink data_sink;
-
-  data_sink.write = [&](const char *d, size_t l) -> bool {
-    if (ok) {
-      if (strm.is_writable() && write_data(strm, d, l)) {
-        offset += l;
-      } else {
-        ok = false;
-      }
-    }
-    return ok;
-  };
-
-  while (offset < end_offset && !is_shutting_down()) {
-    if (!strm.is_writable()) {
-      error = Error::Write;
-      return false;
-    } else if (!content_provider(offset, end_offset - offset, data_sink)) {
-      error = Error::Canceled;
-      return false;
-    } else if (!ok) {
-      error = Error::Write;
-      return false;
-    }
-  }
-
-  error = Error::Success;
-  return true;
-}
-
-template <typename T>
-inline bool write_content(Stream &strm, const ContentProvider &content_provider,
-                          size_t offset, size_t length,
-                          const T &is_shutting_down) {
-  auto error = Error::Success;
-  return write_content(strm, content_provider, offset, length, is_shutting_down,
-                       error);
-}
-
-template <typename T>
-inline bool
-write_content_without_length(Stream &strm,
-                             const ContentProvider &content_provider,
-                             const T &is_shutting_down) {
-  size_t offset = 0;
-  auto data_available = true;
-  auto ok = true;
-  DataSink data_sink;
-
-  data_sink.write = [&](const char *d, size_t l) -> bool {
-    if (ok) {
-      offset += l;
-      if (!strm.is_writable() || !write_data(strm, d, l)) { ok = false; }
-    }
-    return ok;
-  };
-
-  data_sink.done = [&](void) { data_available = false; };
-
-  while (data_available && !is_shutting_down()) {
-    if (!strm.is_writable()) {
-      return false;
-    } else if (!content_provider(offset, 0, data_sink)) {
-      return false;
-    } else if (!ok) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <typename T, typename U>
-inline bool
-write_content_chunked(Stream &strm, const ContentProvider &content_provider,
-                      const T &is_shutting_down, U &compressor, Error &error) {
-  size_t offset = 0;
-  auto data_available = true;
-  auto ok = true;
-  DataSink data_sink;
-
-  data_sink.write = [&](const char *d, size_t l) -> bool {
-    if (ok) {
-      data_available = l > 0;
-      offset += l;
-
-      std::string payload;
-      if (compressor.compress(d, l, false,
-                              [&](const char *data, size_t data_len) {
-                                payload.append(data, data_len);
-                                return true;
-                              })) {
-        if (!payload.empty()) {
-          // Emit chunked response header and footer for each chunk
-          auto chunk =
-              from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n";
-          if (!strm.is_writable() ||
-              !write_data(strm, chunk.data(), chunk.size())) {
-            ok = false;
-          }
-        }
-      } else {
-        ok = false;
-      }
-    }
-    return ok;
-  };
-
-  auto done_with_trailer = [&](const Headers *trailer) {
-    if (!ok) { return; }
-
-    data_available = false;
-
-    std::string payload;
-    if (!compressor.compress(nullptr, 0, true,
-                             [&](const char *data, size_t data_len) {
-                               payload.append(data, data_len);
-                               return true;
-                             })) {
-      ok = false;
-      return;
-    }
-
-    if (!payload.empty()) {
-      // Emit chunked response header and footer for each chunk
-      auto chunk = from_i_to_hex(payload.size()) + "\r\n" + payload + "\r\n";
-      if (!strm.is_writable() ||
-          !write_data(strm, chunk.data(), chunk.size())) {
-        ok = false;
-        return;
-      }
-    }
-
-    static const std::string done_marker("0\r\n");
-    if (!write_data(strm, done_marker.data(), done_marker.size())) {
-      ok = false;
-    }
-
-    // Trailer
-    if (trailer) {
-      for (const auto &kv : *trailer) {
-        std::string field_line = kv.first + ": " + kv.second + "\r\n";
-        if (!write_data(strm, field_line.data(), field_line.size())) {
-          ok = false;
-        }
-      }
-    }
-
-    static const std::string crlf("\r\n");
-    if (!write_data(strm, crlf.data(), crlf.size())) { ok = false; }
-  };
-
-  data_sink.done = [&](void) { done_with_trailer(nullptr); };
-
-  data_sink.done_with_trailer = [&](const Headers &trailer) {
-    done_with_trailer(&trailer);
-  };
-
-  while (data_available && !is_shutting_down()) {
-    if (!strm.is_writable()) {
-      error = Error::Write;
-      return false;
-    } else if (!content_provider(offset, 0, data_sink)) {
-      error = Error::Canceled;
-      return false;
-    } else if (!ok) {
-      error = Error::Write;
-      return false;
-    }
-  }
-
-  error = Error::Success;
-  return true;
-}
-
-template <typename T, typename U>
-inline bool write_content_chunked(Stream &strm,
-                                  const ContentProvider &content_provider,
-                                  const T &is_shutting_down, U &compressor) {
-  auto error = Error::Success;
-  return write_content_chunked(strm, content_provider, is_shutting_down,
-                               compressor, error);
-}
-
-template <typename T>
-inline bool redirect(T &cli, Request &req, Response &res,
-                     const std::string &path, const std::string &location,
-                     Error &error) {
-  Request new_req = req;
-  new_req.path = path;
-  new_req.redirect_count_ -= 1;
-
-  if (res.status == 303 && (req.method != "GET" && req.method != "HEAD")) {
-    new_req.method = "GET";
-    new_req.body.clear();
-    new_req.headers.clear();
-  }
-
-  Response new_res;
-
-  auto ret = cli.send(new_req, new_res, error);
-  if (ret) {
-    req = new_req;
-    res = new_res;
-    res.location = location;
-  }
-  return ret;
-}
-
-inline std::string params_to_query_str(const Params &params) {
-  std::string query;
-
-  for (auto it = params.begin(); it != params.end(); ++it) {
-    if (it != params.begin()) { query += "&"; }
-    query += it->first;
-    query += "=";
-    query += encode_query_param(it->second);
-  }
-  return query;
-}
-
-inline void parse_query_text(const std::string &s, Params &params) {
-  std::set<std::string> cache;
-  split(s.data(), s.data() + s.size(), '&', [&](const char *b, const char *e) {
-    std::string kv(b, e);
-    if (cache.find(kv) != cache.end()) { return; }
-    cache.insert(kv);
-
-    std::string key;
-    std::string val;
-    split(b, e, '=', [&](const char *b2, const char *e2) {
-      if (key.empty()) {
-        key.assign(b2, e2);
-      } else {
-        val.assign(b2, e2);
-      }
-    });
-
-    if (!key.empty()) {
-      params.emplace(decode_url(key, true), decode_url(val, true));
-    }
-  });
-}
-
-inline bool parse_multipart_boundary(const std::string &content_type,
-                                     std::string &boundary) {
-  auto boundary_keyword = "boundary=";
-  auto pos = content_type.find(boundary_keyword);
-  if (pos == std::string::npos) { return false; }
-  auto end = content_type.find(';', pos);
-  auto beg = pos + strlen(boundary_keyword);
-  boundary = content_type.substr(beg, end - beg);
-  if (boundary.length() >= 2 && boundary.front() == '"' &&
-      boundary.back() == '"') {
-    boundary = boundary.substr(1, boundary.size() - 2);
-  }
-  return !boundary.empty();
-}
-
-#ifdef CPPHTTPLIB_NO_EXCEPTIONS
-inline bool parse_range_header(const std::string &s, Ranges &ranges) {
-#else
-inline bool parse_range_header(const std::string &s, Ranges &ranges) try {
-#endif
-  static auto re_first_range = std::regex(R"(bytes=(\d*-\d*(?:,\s*\d*-\d*)*))");
-  std::smatch m;
-  if (std::regex_match(s, m, re_first_range)) {
-    auto pos = static_cast<size_t>(m.position(1));
-    auto len = static_cast<size_t>(m.length(1));
-    bool all_valid_ranges = true;
-    split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) {
-      if (!all_valid_ranges) return;
-      static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))");
-      std::cmatch cm;
-      if (std::regex_match(b, e, cm, re_another_range)) {
-        ssize_t first = -1;
-        if (!cm.str(1).empty()) {
-          first = static_cast<ssize_t>(std::stoll(cm.str(1)));
-        }
-
-        ssize_t last = -1;
-        if (!cm.str(2).empty()) {
-          last = static_cast<ssize_t>(std::stoll(cm.str(2)));
-        }
-
-        if (first != -1 && last != -1 && first > last) {
-          all_valid_ranges = false;
-          return;
-        }
-        ranges.emplace_back(std::make_pair(first, last));
-      }
-    });
-    return all_valid_ranges;
-  }
-  return false;
-#ifdef CPPHTTPLIB_NO_EXCEPTIONS
-}
-#else
-} catch (...) { return false; }
-#endif
-
-class MultipartFormDataParser {
-public:
-  MultipartFormDataParser() = default;
-
-  void set_boundary(std::string &&boundary) {
-    boundary_ = boundary;
-    dash_boundary_crlf_ = dash_ + boundary_ + crlf_;
-    crlf_dash_boundary_ = crlf_ + dash_ + boundary_;
-  }
-
-  bool is_valid() const { return is_valid_; }
-
-  bool parse(const char *buf, size_t n, const ContentReceiver &content_callback,
-             const MultipartContentHeader &header_callback) {
-
-    // TODO: support 'filename*'
-    static const std::regex re_content_disposition(
-        R"~(^Content-Disposition:\s*form-data;\s*name="(.*?)"(?:;\s*filename="(.*?)")?(?:;\s*filename\*=\S+)?\s*$)~",
-        std::regex_constants::icase);
-
-    buf_append(buf, n);
-
-    while (buf_size() > 0) {
-      switch (state_) {
-      case 0: { // Initial boundary
-        buf_erase(buf_find(dash_boundary_crlf_));
-        if (dash_boundary_crlf_.size() > buf_size()) { return true; }
-        if (!buf_start_with(dash_boundary_crlf_)) { return false; }
-        buf_erase(dash_boundary_crlf_.size());
-        state_ = 1;
-        break;
-      }
-      case 1: { // New entry
-        clear_file_info();
-        state_ = 2;
-        break;
-      }
-      case 2: { // Headers
-        auto pos = buf_find(crlf_);
-        if (pos > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; }
-        while (pos < buf_size()) {
-          // Empty line
-          if (pos == 0) {
-            if (!header_callback(file_)) {
-              is_valid_ = false;
-              return false;
-            }
-            buf_erase(crlf_.size());
-            state_ = 3;
-            break;
-          }
-
-          static const std::string header_name = "content-type:";
-          const auto header = buf_head(pos);
-          if (start_with_case_ignore(header, header_name)) {
-            file_.content_type = trim_copy(header.substr(header_name.size()));
-          } else {
-            std::smatch m;
-            if (std::regex_match(header, m, re_content_disposition)) {
-              file_.name = m[1];
-              file_.filename = m[2];
-            } else {
-              is_valid_ = false;
-              return false;
-            }
-          }
-          buf_erase(pos + crlf_.size());
-          pos = buf_find(crlf_);
-        }
-        if (state_ != 3) { return true; }
-        break;
-      }
-      case 3: { // Body
-        if (crlf_dash_boundary_.size() > buf_size()) { return true; }
-        auto pos = buf_find(crlf_dash_boundary_);
-        if (pos < buf_size()) {
-          if (!content_callback(buf_data(), pos)) {
-            is_valid_ = false;
-            return false;
-          }
-          buf_erase(pos + crlf_dash_boundary_.size());
-          state_ = 4;
-        } else {
-          auto len = buf_size() - crlf_dash_boundary_.size();
-          if (len > 0) {
-            if (!content_callback(buf_data(), len)) {
-              is_valid_ = false;
-              return false;
-            }
-            buf_erase(len);
-          }
-          return true;
-        }
-        break;
-      }
-      case 4: { // Boundary
-        if (crlf_.size() > buf_size()) { return true; }
-        if (buf_start_with(crlf_)) {
-          buf_erase(crlf_.size());
-          state_ = 1;
-        } else {
-          if (dash_crlf_.size() > buf_size()) { return true; }
-          if (buf_start_with(dash_crlf_)) {
-            buf_erase(dash_crlf_.size());
-            is_valid_ = true;
-            buf_erase(buf_size()); // Remove epilogue
-          } else {
-            return true;
-          }
-        }
-        break;
-      }
-      }
-    }
-
-    return true;
-  }
-
-private:
-  void clear_file_info() {
-    file_.name.clear();
-    file_.filename.clear();
-    file_.content_type.clear();
-  }
-
-  bool start_with_case_ignore(const std::string &a,
-                              const std::string &b) const {
-    if (a.size() < b.size()) { return false; }
-    for (size_t i = 0; i < b.size(); i++) {
-      if (::tolower(a[i]) != ::tolower(b[i])) { return false; }
-    }
-    return true;
-  }
-
-  const std::string dash_ = "--";
-  const std::string crlf_ = "\r\n";
-  const std::string dash_crlf_ = "--\r\n";
-  std::string boundary_;
-  std::string dash_boundary_crlf_;
-  std::string crlf_dash_boundary_;
-
-  size_t state_ = 0;
-  bool is_valid_ = false;
-  MultipartFormData file_;
-
-  // Buffer
-  bool start_with(const std::string &a, size_t spos, size_t epos,
-                  const std::string &b) const {
-    if (epos - spos < b.size()) { return false; }
-    for (size_t i = 0; i < b.size(); i++) {
-      if (a[i + spos] != b[i]) { return false; }
-    }
-    return true;
-  }
-
-  size_t buf_size() const { return buf_epos_ - buf_spos_; }
-
-  const char *buf_data() const { return &buf_[buf_spos_]; }
-
-  std::string buf_head(size_t l) const { return buf_.substr(buf_spos_, l); }
-
-  bool buf_start_with(const std::string &s) const {
-    return start_with(buf_, buf_spos_, buf_epos_, s);
-  }
-
-  size_t buf_find(const std::string &s) const {
-    auto c = s.front();
-
-    size_t off = buf_spos_;
-    while (off < buf_epos_) {
-      auto pos = off;
-      while (true) {
-        if (pos == buf_epos_) { return buf_size(); }
-        if (buf_[pos] == c) { break; }
-        pos++;
-      }
-
-      auto remaining_size = buf_epos_ - pos;
-      if (s.size() > remaining_size) { return buf_size(); }
-
-      if (start_with(buf_, pos, buf_epos_, s)) { return pos - buf_spos_; }
-
-      off = pos + 1;
-    }
-
-    return buf_size();
-  }
-
-  void buf_append(const char *data, size_t n) {
-    auto remaining_size = buf_size();
-    if (remaining_size > 0 && buf_spos_ > 0) {
-      for (size_t i = 0; i < remaining_size; i++) {
-        buf_[i] = buf_[buf_spos_ + i];
-      }
-    }
-    buf_spos_ = 0;
-    buf_epos_ = remaining_size;
-
-    if (remaining_size + n > buf_.size()) { buf_.resize(remaining_size + n); }
-
-    for (size_t i = 0; i < n; i++) {
-      buf_[buf_epos_ + i] = data[i];
-    }
-    buf_epos_ += n;
-  }
-
-  void buf_erase(size_t size) { buf_spos_ += size; }
-
-  std::string buf_;
-  size_t buf_spos_ = 0;
-  size_t buf_epos_ = 0;
-};
-
-inline std::string to_lower(const char *beg, const char *end) {
-  std::string out;
-  auto it = beg;
-  while (it != end) {
-    out += static_cast<char>(::tolower(*it));
-    it++;
-  }
-  return out;
-}
-
-inline std::string make_multipart_data_boundary() {
-  static const char data[] =
-      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-  // std::random_device might actually be deterministic on some
-  // platforms, but due to lack of support in the c++ standard library,
-  // doing better requires either some ugly hacks or breaking portability.
-  std::random_device seed_gen;
-
-  // Request 128 bits of entropy for initialization
-  std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()};
-  std::mt19937 engine(seed_sequence);
-
-  std::string result = "--cpp-httplib-multipart-data-";
-
-  for (auto i = 0; i < 16; i++) {
-    result += data[engine() % (sizeof(data) - 1)];
-  }
-
-  return result;
-}
-
-inline bool is_multipart_boundary_chars_valid(const std::string &boundary) {
-  auto valid = true;
-  for (size_t i = 0; i < boundary.size(); i++) {
-    auto c = boundary[i];
-    if (!std::isalnum(c) && c != '-' && c != '_') {
-      valid = false;
-      break;
-    }
-  }
-  return valid;
-}
-
-template <typename T>
-inline std::string
-serialize_multipart_formdata_item_begin(const T &item,
-                                        const std::string &boundary) {
-  std::string body = "--" + boundary + "\r\n";
-  body += "Content-Disposition: form-data; name=\"" + item.name + "\"";
-  if (!item.filename.empty()) {
-    body += "; filename=\"" + item.filename + "\"";
-  }
-  body += "\r\n";
-  if (!item.content_type.empty()) {
-    body += "Content-Type: " + item.content_type + "\r\n";
-  }
-  body += "\r\n";
-
-  return body;
-}
-
-inline std::string serialize_multipart_formdata_item_end() { return "\r\n"; }
-
-inline std::string
-serialize_multipart_formdata_finish(const std::string &boundary) {
-  return "--" + boundary + "--\r\n";
-}
-
-inline std::string
-serialize_multipart_formdata_get_content_type(const std::string &boundary) {
-  return "multipart/form-data; boundary=" + boundary;
-}
-
-inline std::string
-serialize_multipart_formdata(const MultipartFormDataItems &items,
-                             const std::string &boundary, bool finish = true) {
-  std::string body;
-
-  for (const auto &item : items) {
-    body += serialize_multipart_formdata_item_begin(item, boundary);
-    body += item.content + serialize_multipart_formdata_item_end();
-  }
-
-  if (finish) body += serialize_multipart_formdata_finish(boundary);
-
-  return body;
-}
-
-inline std::pair<size_t, size_t>
-get_range_offset_and_length(const Request &req, size_t content_length,
-                            size_t index) {
-  auto r = req.ranges[index];
-
-  if (r.first == -1 && r.second == -1) {
-    return std::make_pair(0, content_length);
-  }
-
-  auto slen = static_cast<ssize_t>(content_length);
-
-  if (r.first == -1) {
-    r.first = (std::max)(static_cast<ssize_t>(0), slen - r.second);
-    r.second = slen - 1;
-  }
-
-  if (r.second == -1) { r.second = slen - 1; }
-  return std::make_pair(r.first, static_cast<size_t>(r.second - r.first) + 1);
-}
-
-inline std::string make_content_range_header_field(size_t offset, size_t length,
-                                                   size_t content_length) {
-  std::string field = "bytes ";
-  field += std::to_string(offset);
-  field += "-";
-  field += std::to_string(offset + length - 1);
-  field += "/";
-  field += std::to_string(content_length);
-  return field;
-}
-
-template <typename SToken, typename CToken, typename Content>
-bool process_multipart_ranges_data(const Request &req, Response &res,
-                                   const std::string &boundary,
-                                   const std::string &content_type,
-                                   SToken stoken, CToken ctoken,
-                                   Content content) {
-  for (size_t i = 0; i < req.ranges.size(); i++) {
-    ctoken("--");
-    stoken(boundary);
-    ctoken("\r\n");
-    if (!content_type.empty()) {
-      ctoken("Content-Type: ");
-      stoken(content_type);
-      ctoken("\r\n");
-    }
-
-    auto offsets = get_range_offset_and_length(req, res.body.size(), i);
-    auto offset = offsets.first;
-    auto length = offsets.second;
-
-    ctoken("Content-Range: ");
-    stoken(make_content_range_header_field(offset, length, res.body.size()));
-    ctoken("\r\n");
-    ctoken("\r\n");
-    if (!content(offset, length)) { return false; }
-    ctoken("\r\n");
-  }
-
-  ctoken("--");
-  stoken(boundary);
-  ctoken("--\r\n");
-
-  return true;
-}
-
-inline bool make_multipart_ranges_data(const Request &req, Response &res,
-                                       const std::string &boundary,
-                                       const std::string &content_type,
-                                       std::string &data) {
-  return process_multipart_ranges_data(
-      req, res, boundary, content_type,
-      [&](const std::string &token) { data += token; },
-      [&](const std::string &token) { data += token; },
-      [&](size_t offset, size_t length) {
-        if (offset < res.body.size()) {
-          data += res.body.substr(offset, length);
-          return true;
-        }
-        return false;
-      });
-}
-
-inline size_t
-get_multipart_ranges_data_length(const Request &req, Response &res,
-                                 const std::string &boundary,
-                                 const std::string &content_type) {
-  size_t data_length = 0;
-
-  process_multipart_ranges_data(
-      req, res, boundary, content_type,
-      [&](const std::string &token) { data_length += token.size(); },
-      [&](const std::string &token) { data_length += token.size(); },
-      [&](size_t /*offset*/, size_t length) {
-        data_length += length;
-        return true;
-      });
-
-  return data_length;
-}
-
-template <typename T>
-inline bool write_multipart_ranges_data(Stream &strm, const Request &req,
-                                        Response &res,
-                                        const std::string &boundary,
-                                        const std::string &content_type,
-                                        const T &is_shutting_down) {
-  return process_multipart_ranges_data(
-      req, res, boundary, content_type,
-      [&](const std::string &token) { strm.write(token); },
-      [&](const std::string &token) { strm.write(token); },
-      [&](size_t offset, size_t length) {
-        return write_content(strm, res.content_provider_, offset, length,
-                             is_shutting_down);
-      });
-}
-
-inline std::pair<size_t, size_t>
-get_range_offset_and_length(const Request &req, const Response &res,
-                            size_t index) {
-  auto r = req.ranges[index];
-
-  if (r.second == -1) {
-    r.second = static_cast<ssize_t>(res.content_length_) - 1;
-  }
-
-  return std::make_pair(r.first, r.second - r.first + 1);
-}
-
-inline bool expect_content(const Request &req) {
-  if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" ||
-      req.method == "PRI" || req.method == "DELETE") {
-    return true;
-  }
-  // TODO: check if Content-Length is set
-  return false;
-}
-
-inline bool has_crlf(const std::string &s) {
-  auto p = s.c_str();
-  while (*p) {
-    if (*p == '\r' || *p == '\n') { return true; }
-    p++;
-  }
-  return false;
-}
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline std::string message_digest(const std::string &s, const EVP_MD *algo) {
-  auto context = std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)>(
-      EVP_MD_CTX_new(), EVP_MD_CTX_free);
-
-  unsigned int hash_length = 0;
-  unsigned char hash[EVP_MAX_MD_SIZE];
-
-  EVP_DigestInit_ex(context.get(), algo, nullptr);
-  EVP_DigestUpdate(context.get(), s.c_str(), s.size());
-  EVP_DigestFinal_ex(context.get(), hash, &hash_length);
-
-  std::stringstream ss;
-  for (auto i = 0u; i < hash_length; ++i) {
-    ss << std::hex << std::setw(2) << std::setfill('0')
-       << (unsigned int)hash[i];
-  }
-
-  return ss.str();
-}
-
-inline std::string MD5(const std::string &s) {
-  return message_digest(s, EVP_md5());
-}
-
-inline std::string SHA_256(const std::string &s) {
-  return message_digest(s, EVP_sha256());
-}
-
-inline std::string SHA_512(const std::string &s) {
-  return message_digest(s, EVP_sha512());
-}
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-#ifdef _WIN32
-// NOTE: This code came up with the following stackoverflow post:
-// https://stackoverflow.com/questions/9507184/can-openssl-on-windows-use-the-system-certificate-store
-inline bool load_system_certs_on_windows(X509_STORE *store) {
-  auto hStore = CertOpenSystemStoreW((HCRYPTPROV_LEGACY)NULL, L"ROOT");
-  if (!hStore) { return false; }
-
-  auto result = false;
-  PCCERT_CONTEXT pContext = NULL;
-  while ((pContext = CertEnumCertificatesInStore(hStore, pContext)) !=
-         nullptr) {
-    auto encoded_cert =
-        static_cast<const unsigned char *>(pContext->pbCertEncoded);
-
-    auto x509 = d2i_X509(NULL, &encoded_cert, pContext->cbCertEncoded);
-    if (x509) {
-      X509_STORE_add_cert(store, x509);
-      X509_free(x509);
-      result = true;
-    }
-  }
-
-  CertFreeCertificateContext(pContext);
-  CertCloseStore(hStore, 0);
-
-  return result;
-}
-#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__)
-#if TARGET_OS_OSX
-template <typename T>
-using CFObjectPtr =
-    std::unique_ptr<typename std::remove_pointer<T>::type, void (*)(CFTypeRef)>;
-
-inline void cf_object_ptr_deleter(CFTypeRef obj) {
-  if (obj) { CFRelease(obj); }
-}
-
-inline bool retrieve_certs_from_keychain(CFObjectPtr<CFArrayRef> &certs) {
-  CFStringRef keys[] = {kSecClass, kSecMatchLimit, kSecReturnRef};
-  CFTypeRef values[] = {kSecClassCertificate, kSecMatchLimitAll,
-                        kCFBooleanTrue};
-
-  CFObjectPtr<CFDictionaryRef> query(
-      CFDictionaryCreate(nullptr, reinterpret_cast<const void **>(keys), values,
-                         sizeof(keys) / sizeof(keys[0]),
-                         &kCFTypeDictionaryKeyCallBacks,
-                         &kCFTypeDictionaryValueCallBacks),
-      cf_object_ptr_deleter);
-
-  if (!query) { return false; }
-
-  CFTypeRef security_items = nullptr;
-  if (SecItemCopyMatching(query.get(), &security_items) != errSecSuccess ||
-      CFArrayGetTypeID() != CFGetTypeID(security_items)) {
-    return false;
-  }
-
-  certs.reset(reinterpret_cast<CFArrayRef>(security_items));
-  return true;
-}
-
-inline bool retrieve_root_certs_from_keychain(CFObjectPtr<CFArrayRef> &certs) {
-  CFArrayRef root_security_items = nullptr;
-  if (SecTrustCopyAnchorCertificates(&root_security_items) != errSecSuccess) {
-    return false;
-  }
-
-  certs.reset(root_security_items);
-  return true;
-}
-
-inline bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) {
-  auto result = false;
-  for (int i = 0; i < CFArrayGetCount(certs); ++i) {
-    const auto cert = reinterpret_cast<const __SecCertificate *>(
-        CFArrayGetValueAtIndex(certs, i));
-
-    if (SecCertificateGetTypeID() != CFGetTypeID(cert)) { continue; }
-
-    CFDataRef cert_data = nullptr;
-    if (SecItemExport(cert, kSecFormatX509Cert, 0, nullptr, &cert_data) !=
-        errSecSuccess) {
-      continue;
-    }
-
-    CFObjectPtr<CFDataRef> cert_data_ptr(cert_data, cf_object_ptr_deleter);
-
-    auto encoded_cert = static_cast<const unsigned char *>(
-        CFDataGetBytePtr(cert_data_ptr.get()));
-
-    auto x509 =
-        d2i_X509(NULL, &encoded_cert, CFDataGetLength(cert_data_ptr.get()));
-
-    if (x509) {
-      X509_STORE_add_cert(store, x509);
-      X509_free(x509);
-      result = true;
-    }
-  }
-
-  return result;
-}
-
-inline bool load_system_certs_on_macos(X509_STORE *store) {
-  auto result = false;
-  CFObjectPtr<CFArrayRef> certs(nullptr, cf_object_ptr_deleter);
-  if (retrieve_certs_from_keychain(certs) && certs) {
-    result = add_certs_to_x509_store(certs.get(), store);
-  }
-
-  if (retrieve_root_certs_from_keychain(certs) && certs) {
-    result = add_certs_to_x509_store(certs.get(), store) || result;
-  }
-
-  return result;
-}
-#endif // TARGET_OS_OSX
-#endif // _WIN32
-#endif // CPPHTTPLIB_OPENSSL_SUPPORT
-
-#ifdef _WIN32
-class WSInit {
-public:
-  WSInit() {
-    WSADATA wsaData;
-    if (WSAStartup(0x0002, &wsaData) == 0) is_valid_ = true;
-  }
-
-  ~WSInit() {
-    if (is_valid_) WSACleanup();
-  }
-
-  bool is_valid_ = false;
-};
-
-static WSInit wsinit_;
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline std::pair<std::string, std::string> make_digest_authentication_header(
-    const Request &req, const std::map<std::string, std::string> &auth,
-    size_t cnonce_count, const std::string &cnonce, const std::string &username,
-    const std::string &password, bool is_proxy = false) {
-  std::string nc;
-  {
-    std::stringstream ss;
-    ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count;
-    nc = ss.str();
-  }
-
-  std::string qop;
-  if (auth.find("qop") != auth.end()) {
-    qop = auth.at("qop");
-    if (qop.find("auth-int") != std::string::npos) {
-      qop = "auth-int";
-    } else if (qop.find("auth") != std::string::npos) {
-      qop = "auth";
-    } else {
-      qop.clear();
-    }
-  }
-
-  std::string algo = "MD5";
-  if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); }
-
-  std::string response;
-  {
-    auto H = algo == "SHA-256"   ? detail::SHA_256
-             : algo == "SHA-512" ? detail::SHA_512
-                                 : detail::MD5;
-
-    auto A1 = username + ":" + auth.at("realm") + ":" + password;
-
-    auto A2 = req.method + ":" + req.path;
-    if (qop == "auth-int") { A2 += ":" + H(req.body); }
-
-    if (qop.empty()) {
-      response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2));
-    } else {
-      response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce +
-                   ":" + qop + ":" + H(A2));
-    }
-  }
-
-  auto opaque = (auth.find("opaque") != auth.end()) ? auth.at("opaque") : "";
-
-  auto field = "Digest username=\"" + username + "\", realm=\"" +
-               auth.at("realm") + "\", nonce=\"" + auth.at("nonce") +
-               "\", uri=\"" + req.path + "\", algorithm=" + algo +
-               (qop.empty() ? ", response=\""
-                            : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" +
-                                  cnonce + "\", response=\"") +
-               response + "\"" +
-               (opaque.empty() ? "" : ", opaque=\"" + opaque + "\"");
-
-  auto key = is_proxy ? "Proxy-Authorization" : "Authorization";
-  return std::make_pair(key, field);
-}
-#endif
-
-inline bool parse_www_authenticate(const Response &res,
-                                   std::map<std::string, std::string> &auth,
-                                   bool is_proxy) {
-  auto auth_key = is_proxy ? "Proxy-Authenticate" : "WWW-Authenticate";
-  if (res.has_header(auth_key)) {
-    static auto re = std::regex(R"~((?:(?:,\s*)?(.+?)=(?:"(.*?)"|([^,]*))))~");
-    auto s = res.get_header_value(auth_key);
-    auto pos = s.find(' ');
-    if (pos != std::string::npos) {
-      auto type = s.substr(0, pos);
-      if (type == "Basic") {
-        return false;
-      } else if (type == "Digest") {
-        s = s.substr(pos + 1);
-        auto beg = std::sregex_iterator(s.begin(), s.end(), re);
-        for (auto i = beg; i != std::sregex_iterator(); ++i) {
-          auto m = *i;
-          auto key = s.substr(static_cast<size_t>(m.position(1)),
-                              static_cast<size_t>(m.length(1)));
-          auto val = m.length(2) > 0
-                         ? s.substr(static_cast<size_t>(m.position(2)),
-                                    static_cast<size_t>(m.length(2)))
-                         : s.substr(static_cast<size_t>(m.position(3)),
-                                    static_cast<size_t>(m.length(3)));
-          auth[key] = val;
-        }
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// https://stackoverflow.com/questions/440133/how-do-i-create-a-random-alpha-numeric-string-in-c/440240#answer-440240
-inline std::string random_string(size_t length) {
-  auto randchar = []() -> char {
-    const char charset[] = "0123456789"
-                           "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                           "abcdefghijklmnopqrstuvwxyz";
-    const size_t max_index = (sizeof(charset) - 1);
-    return charset[static_cast<size_t>(std::rand()) % max_index];
-  };
-  std::string str(length, 0);
-  std::generate_n(str.begin(), length, randchar);
-  return str;
-}
-
-class ContentProviderAdapter {
-public:
-  explicit ContentProviderAdapter(
-      ContentProviderWithoutLength &&content_provider)
-      : content_provider_(content_provider) {}
-
-  bool operator()(size_t offset, size_t, DataSink &sink) {
-    return content_provider_(offset, sink);
-  }
-
-private:
-  ContentProviderWithoutLength content_provider_;
-};
-
-} // namespace detail
-
-inline std::string hosted_at(const std::string &hostname) {
-  std::vector<std::string> addrs;
-  hosted_at(hostname, addrs);
-  if (addrs.empty()) { return std::string(); }
-  return addrs[0];
-}
-
-inline void hosted_at(const std::string &hostname,
-                      std::vector<std::string> &addrs) {
-  struct addrinfo hints;
-  struct addrinfo *result;
-
-  memset(&hints, 0, sizeof(struct addrinfo));
-  hints.ai_family = AF_UNSPEC;
-  hints.ai_socktype = SOCK_STREAM;
-  hints.ai_protocol = 0;
-
-  if (getaddrinfo(hostname.c_str(), nullptr, &hints, &result)) {
-#if defined __linux__ && !defined __ANDROID__
-    res_init();
-#endif
-    return;
-  }
-
-  for (auto rp = result; rp; rp = rp->ai_next) {
-    const auto &addr =
-        *reinterpret_cast<struct sockaddr_storage *>(rp->ai_addr);
-    std::string ip;
-    int dummy = -1;
-    if (detail::get_ip_and_port(addr, sizeof(struct sockaddr_storage), ip,
-                                dummy)) {
-      addrs.push_back(ip);
-    }
-  }
-
-  freeaddrinfo(result);
-}
-
-inline std::string append_query_params(const std::string &path,
-                                       const Params &params) {
-  std::string path_with_query = path;
-  const static std::regex re("[^?]+\\?.*");
-  auto delm = std::regex_match(path, re) ? '&' : '?';
-  path_with_query += delm + detail::params_to_query_str(params);
-  return path_with_query;
-}
-
-// Header utilities
-inline std::pair<std::string, std::string> make_range_header(Ranges ranges) {
-  std::string field = "bytes=";
-  auto i = 0;
-  for (auto r : ranges) {
-    if (i != 0) { field += ", "; }
-    if (r.first != -1) { field += std::to_string(r.first); }
-    field += '-';
-    if (r.second != -1) { field += std::to_string(r.second); }
-    i++;
-  }
-  return std::make_pair("Range", std::move(field));
-}
-
-inline std::pair<std::string, std::string>
-make_basic_authentication_header(const std::string &username,
-                                 const std::string &password, bool is_proxy) {
-  auto field = "Basic " + detail::base64_encode(username + ":" + password);
-  auto key = is_proxy ? "Proxy-Authorization" : "Authorization";
-  return std::make_pair(key, std::move(field));
-}
-
-inline std::pair<std::string, std::string>
-make_bearer_token_authentication_header(const std::string &token,
-                                        bool is_proxy = false) {
-  auto field = "Bearer " + token;
-  auto key = is_proxy ? "Proxy-Authorization" : "Authorization";
-  return std::make_pair(key, std::move(field));
-}
-
-// Request implementation
-inline bool Request::has_header(const std::string &key) const {
-  return detail::has_header(headers, key);
-}
-
-inline std::string Request::get_header_value(const std::string &key,
-                                             size_t id) const {
-  return detail::get_header_value(headers, key, id, "");
-}
-
-inline size_t Request::get_header_value_count(const std::string &key) const {
-  auto r = headers.equal_range(key);
-  return static_cast<size_t>(std::distance(r.first, r.second));
-}
-
-inline void Request::set_header(const std::string &key,
-                                const std::string &val) {
-  if (!detail::has_crlf(key) && !detail::has_crlf(val)) {
-    headers.emplace(key, val);
-  }
-}
-
-inline bool Request::has_param(const std::string &key) const {
-  return params.find(key) != params.end();
-}
-
-inline std::string Request::get_param_value(const std::string &key,
-                                            size_t id) const {
-  auto rng = params.equal_range(key);
-  auto it = rng.first;
-  std::advance(it, static_cast<ssize_t>(id));
-  if (it != rng.second) { return it->second; }
-  return std::string();
-}
-
-inline size_t Request::get_param_value_count(const std::string &key) const {
-  auto r = params.equal_range(key);
-  return static_cast<size_t>(std::distance(r.first, r.second));
-}
-
-inline bool Request::is_multipart_form_data() const {
-  const auto &content_type = get_header_value("Content-Type");
-  return !content_type.rfind("multipart/form-data", 0);
-}
-
-inline bool Request::has_file(const std::string &key) const {
-  return files.find(key) != files.end();
-}
-
-inline MultipartFormData Request::get_file_value(const std::string &key) const {
-  auto it = files.find(key);
-  if (it != files.end()) { return it->second; }
-  return MultipartFormData();
-}
-
-inline std::vector<MultipartFormData>
-Request::get_file_values(const std::string &key) const {
-  std::vector<MultipartFormData> values;
-  auto rng = files.equal_range(key);
-  for (auto it = rng.first; it != rng.second; it++) {
-    values.push_back(it->second);
-  }
-  return values;
-}
-
-// Response implementation
-inline bool Response::has_header(const std::string &key) const {
-  return headers.find(key) != headers.end();
-}
-
-inline std::string Response::get_header_value(const std::string &key,
-                                              size_t id) const {
-  return detail::get_header_value(headers, key, id, "");
-}
-
-inline size_t Response::get_header_value_count(const std::string &key) const {
-  auto r = headers.equal_range(key);
-  return static_cast<size_t>(std::distance(r.first, r.second));
-}
-
-inline void Response::set_header(const std::string &key,
-                                 const std::string &val) {
-  if (!detail::has_crlf(key) && !detail::has_crlf(val)) {
-    headers.emplace(key, val);
-  }
-}
-
-inline void Response::set_redirect(const std::string &url, int stat) {
-  if (!detail::has_crlf(url)) {
-    set_header("Location", url);
-    if (300 <= stat && stat < 400) {
-      this->status = stat;
-    } else {
-      this->status = 302;
-    }
-  }
-}
-
-inline void Response::set_content(const char *s, size_t n,
-                                  const std::string &content_type) {
-  body.assign(s, n);
-
-  auto rng = headers.equal_range("Content-Type");
-  headers.erase(rng.first, rng.second);
-  set_header("Content-Type", content_type);
-}
-
-inline void Response::set_content(const std::string &s,
-                                  const std::string &content_type) {
-  set_content(s.data(), s.size(), content_type);
-}
-
-inline void Response::set_content_provider(
-    size_t in_length, const std::string &content_type, ContentProvider provider,
-    ContentProviderResourceReleaser resource_releaser) {
-  set_header("Content-Type", content_type);
-  content_length_ = in_length;
-  if (in_length > 0) { content_provider_ = std::move(provider); }
-  content_provider_resource_releaser_ = resource_releaser;
-  is_chunked_content_provider_ = false;
-}
-
-inline void Response::set_content_provider(
-    const std::string &content_type, ContentProviderWithoutLength provider,
-    ContentProviderResourceReleaser resource_releaser) {
-  set_header("Content-Type", content_type);
-  content_length_ = 0;
-  content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
-  is_chunked_content_provider_ = false;
-}
-
-inline void Response::set_chunked_content_provider(
-    const std::string &content_type, ContentProviderWithoutLength provider,
-    ContentProviderResourceReleaser resource_releaser) {
-  set_header("Content-Type", content_type);
-  content_length_ = 0;
-  content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
-  is_chunked_content_provider_ = true;
-}
-
-// Result implementation
-inline bool Result::has_request_header(const std::string &key) const {
-  return request_headers_.find(key) != request_headers_.end();
-}
-
-inline std::string Result::get_request_header_value(const std::string &key,
-                                                    size_t id) const {
-  return detail::get_header_value(request_headers_, key, id, "");
-}
-
-inline size_t
-Result::get_request_header_value_count(const std::string &key) const {
-  auto r = request_headers_.equal_range(key);
-  return static_cast<size_t>(std::distance(r.first, r.second));
-}
-
-// Stream implementation
-inline ssize_t Stream::write(const char *ptr) {
-  return write(ptr, strlen(ptr));
-}
-
-inline ssize_t Stream::write(const std::string &s) {
-  return write(s.data(), s.size());
-}
-
-namespace detail {
-
-// Socket stream implementation
-inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec,
-                                  time_t read_timeout_usec,
-                                  time_t write_timeout_sec,
-                                  time_t write_timeout_usec)
-    : sock_(sock), read_timeout_sec_(read_timeout_sec),
-      read_timeout_usec_(read_timeout_usec),
-      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec), read_buff_(read_buff_size_, 0) {}
-
-inline SocketStream::~SocketStream() {}
-
-inline bool SocketStream::is_readable() const {
-  return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
-}
-
-inline bool SocketStream::is_writable() const {
-  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_);
-}
-
-inline ssize_t SocketStream::read(char *ptr, size_t size) {
-#ifdef _WIN32
-  size =
-      (std::min)(size, static_cast<size_t>((std::numeric_limits<int>::max)()));
-#else
-  size = (std::min)(size,
-                    static_cast<size_t>((std::numeric_limits<ssize_t>::max)()));
-#endif
-
-  if (read_buff_off_ < read_buff_content_size_) {
-    auto remaining_size = read_buff_content_size_ - read_buff_off_;
-    if (size <= remaining_size) {
-      memcpy(ptr, read_buff_.data() + read_buff_off_, size);
-      read_buff_off_ += size;
-      return static_cast<ssize_t>(size);
-    } else {
-      memcpy(ptr, read_buff_.data() + read_buff_off_, remaining_size);
-      read_buff_off_ += remaining_size;
-      return static_cast<ssize_t>(remaining_size);
-    }
-  }
-
-  if (!is_readable()) { return -1; }
-
-  read_buff_off_ = 0;
-  read_buff_content_size_ = 0;
-
-  if (size < read_buff_size_) {
-    auto n = read_socket(sock_, read_buff_.data(), read_buff_size_,
-                         CPPHTTPLIB_RECV_FLAGS);
-    if (n <= 0) {
-      return n;
-    } else if (n <= static_cast<ssize_t>(size)) {
-      memcpy(ptr, read_buff_.data(), static_cast<size_t>(n));
-      return n;
-    } else {
-      memcpy(ptr, read_buff_.data(), size);
-      read_buff_off_ = size;
-      read_buff_content_size_ = static_cast<size_t>(n);
-      return static_cast<ssize_t>(size);
-    }
-  } else {
-    return read_socket(sock_, ptr, size, CPPHTTPLIB_RECV_FLAGS);
-  }
-}
-
-inline ssize_t SocketStream::write(const char *ptr, size_t size) {
-  if (!is_writable()) { return -1; }
-
-#if defined(_WIN32) && !defined(_WIN64)
-  size =
-      (std::min)(size, static_cast<size_t>((std::numeric_limits<int>::max)()));
-#endif
-
-  return send_socket(sock_, ptr, size, CPPHTTPLIB_SEND_FLAGS);
-}
-
-inline void SocketStream::get_remote_ip_and_port(std::string &ip,
-                                                 int &port) const {
-  return detail::get_remote_ip_and_port(sock_, ip, port);
-}
-
-inline void SocketStream::get_local_ip_and_port(std::string &ip,
-                                                int &port) const {
-  return detail::get_local_ip_and_port(sock_, ip, port);
-}
-
-inline socket_t SocketStream::socket() const { return sock_; }
-
-// Buffer stream implementation
-inline bool BufferStream::is_readable() const { return true; }
-
-inline bool BufferStream::is_writable() const { return true; }
-
-inline ssize_t BufferStream::read(char *ptr, size_t size) {
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  auto len_read = buffer._Copy_s(ptr, size, size, position);
-#else
-  auto len_read = buffer.copy(ptr, size, position);
-#endif
-  position += static_cast<size_t>(len_read);
-  return static_cast<ssize_t>(len_read);
-}
-
-inline ssize_t BufferStream::write(const char *ptr, size_t size) {
-  buffer.append(ptr, size);
-  return static_cast<ssize_t>(size);
-}
-
-inline void BufferStream::get_remote_ip_and_port(std::string & /*ip*/,
-                                                 int & /*port*/) const {}
-
-inline void BufferStream::get_local_ip_and_port(std::string & /*ip*/,
-                                                int & /*port*/) const {}
-
-inline socket_t BufferStream::socket() const { return 0; }
-
-inline const std::string &BufferStream::get_buffer() const { return buffer; }
-
-} // namespace detail
-
-// HTTP server implementation
-inline Server::Server()
-    : new_task_queue(
-          [] { return new ThreadPool(CPPHTTPLIB_THREAD_POOL_COUNT); }) {
-#ifndef _WIN32
-  signal(SIGPIPE, SIG_IGN);
-#endif
-}
-
-inline Server::~Server() {}
-
-inline Server &Server::Get(const std::string &pattern, Handler handler) {
-  get_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Post(const std::string &pattern, Handler handler) {
-  post_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Post(const std::string &pattern,
-                            HandlerWithContentReader handler) {
-  post_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Put(const std::string &pattern, Handler handler) {
-  put_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Put(const std::string &pattern,
-                           HandlerWithContentReader handler) {
-  put_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Patch(const std::string &pattern, Handler handler) {
-  patch_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Patch(const std::string &pattern,
-                             HandlerWithContentReader handler) {
-  patch_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Delete(const std::string &pattern, Handler handler) {
-  delete_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Delete(const std::string &pattern,
-                              HandlerWithContentReader handler) {
-  delete_handlers_for_content_reader_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline Server &Server::Options(const std::string &pattern, Handler handler) {
-  options_handlers_.push_back(
-      std::make_pair(std::regex(pattern), std::move(handler)));
-  return *this;
-}
-
-inline bool Server::set_base_dir(const std::string &dir,
-                                 const std::string &mount_point) {
-  return set_mount_point(mount_point, dir);
-}
-
-inline bool Server::set_mount_point(const std::string &mount_point,
-                                    const std::string &dir, Headers headers) {
-  if (detail::is_dir(dir)) {
-    std::string mnt = !mount_point.empty() ? mount_point : "/";
-    if (!mnt.empty() && mnt[0] == '/') {
-      base_dirs_.push_back({mnt, dir, std::move(headers)});
-      return true;
-    }
-  }
-  return false;
-}
-
-inline bool Server::remove_mount_point(const std::string &mount_point) {
-  for (auto it = base_dirs_.begin(); it != base_dirs_.end(); ++it) {
-    if (it->mount_point == mount_point) {
-      base_dirs_.erase(it);
-      return true;
-    }
-  }
-  return false;
-}
-
-inline Server &
-Server::set_file_extension_and_mimetype_mapping(const std::string &ext,
-                                                const std::string &mime) {
-  file_extension_and_mimetype_map_[ext] = mime;
-  return *this;
-}
-
-inline Server &Server::set_file_request_handler(Handler handler) {
-  file_request_handler_ = std::move(handler);
-  return *this;
-}
-
-inline Server &Server::set_error_handler(HandlerWithResponse handler) {
-  error_handler_ = std::move(handler);
-  return *this;
-}
-
-inline Server &Server::set_error_handler(Handler handler) {
-  error_handler_ = [handler](const Request &req, Response &res) {
-    handler(req, res);
-    return HandlerResponse::Handled;
-  };
-  return *this;
-}
-
-inline Server &Server::set_exception_handler(ExceptionHandler handler) {
-  exception_handler_ = std::move(handler);
-  return *this;
-}
-
-inline Server &Server::set_pre_routing_handler(HandlerWithResponse handler) {
-  pre_routing_handler_ = std::move(handler);
-  return *this;
-}
-
-inline Server &Server::set_post_routing_handler(Handler handler) {
-  post_routing_handler_ = std::move(handler);
-  return *this;
-}
-
-inline Server &Server::set_logger(Logger logger) {
-  logger_ = std::move(logger);
-  return *this;
-}
-
-inline Server &
-Server::set_expect_100_continue_handler(Expect100ContinueHandler handler) {
-  expect_100_continue_handler_ = std::move(handler);
-
-  return *this;
-}
-
-inline Server &Server::set_address_family(int family) {
-  address_family_ = family;
-  return *this;
-}
-
-inline Server &Server::set_tcp_nodelay(bool on) {
-  tcp_nodelay_ = on;
-  return *this;
-}
-
-inline Server &Server::set_socket_options(SocketOptions socket_options) {
-  socket_options_ = std::move(socket_options);
-  return *this;
-}
-
-inline Server &Server::set_default_headers(Headers headers) {
-  default_headers_ = std::move(headers);
-  return *this;
-}
-
-inline Server &Server::set_keep_alive_max_count(size_t count) {
-  keep_alive_max_count_ = count;
-  return *this;
-}
-
-inline Server &Server::set_keep_alive_timeout(time_t sec) {
-  keep_alive_timeout_sec_ = sec;
-  return *this;
-}
-
-inline Server &Server::set_read_timeout(time_t sec, time_t usec) {
-  read_timeout_sec_ = sec;
-  read_timeout_usec_ = usec;
-  return *this;
-}
-
-inline Server &Server::set_write_timeout(time_t sec, time_t usec) {
-  write_timeout_sec_ = sec;
-  write_timeout_usec_ = usec;
-  return *this;
-}
-
-inline Server &Server::set_idle_interval(time_t sec, time_t usec) {
-  idle_interval_sec_ = sec;
-  idle_interval_usec_ = usec;
-  return *this;
-}
-
-inline Server &Server::set_payload_max_length(size_t length) {
-  payload_max_length_ = length;
-  return *this;
-}
-
-inline bool Server::bind_to_port(const std::string &host, int port,
-                                 int socket_flags) {
-  if (bind_internal(host, port, socket_flags) < 0) return false;
-  return true;
-}
-inline int Server::bind_to_any_port(const std::string &host, int socket_flags) {
-  return bind_internal(host, 0, socket_flags);
-}
-
-inline bool Server::listen_after_bind() {
-  auto se = detail::scope_exit([&]() { done_ = true; });
-  return listen_internal();
-}
-
-inline bool Server::listen(const std::string &host, int port,
-                           int socket_flags) {
-  auto se = detail::scope_exit([&]() { done_ = true; });
-  return bind_to_port(host, port, socket_flags) && listen_internal();
-}
-
-inline bool Server::is_running() const { return is_running_; }
-
-inline void Server::wait_until_ready() const {
-  while (!is_running() && !done_) {
-    std::this_thread::sleep_for(std::chrono::milliseconds{1});
-  }
-}
-
-inline void Server::stop() {
-  if (is_running_) {
-    assert(svr_sock_ != INVALID_SOCKET);
-    std::atomic<socket_t> sock(svr_sock_.exchange(INVALID_SOCKET));
-    detail::shutdown_socket(sock);
-    detail::close_socket(sock);
-  }
-}
-
-inline bool Server::parse_request_line(const char *s, Request &req) {
-  auto len = strlen(s);
-  if (len < 2 || s[len - 2] != '\r' || s[len - 1] != '\n') { return false; }
-  len -= 2;
-
-  {
-    size_t count = 0;
-
-    detail::split(s, s + len, ' ', [&](const char *b, const char *e) {
-      switch (count) {
-      case 0: req.method = std::string(b, e); break;
-      case 1: req.target = std::string(b, e); break;
-      case 2: req.version = std::string(b, e); break;
-      default: break;
-      }
-      count++;
-    });
-
-    if (count != 3) { return false; }
-  }
-
-  static const std::set<std::string> methods{
-      "GET",     "HEAD",    "POST",  "PUT",   "DELETE",
-      "CONNECT", "OPTIONS", "TRACE", "PATCH", "PRI"};
-
-  if (methods.find(req.method) == methods.end()) { return false; }
-
-  if (req.version != "HTTP/1.1" && req.version != "HTTP/1.0") { return false; }
-
-  {
-    // Skip URL fragment
-    for (size_t i = 0; i < req.target.size(); i++) {
-      if (req.target[i] == '#') {
-        req.target.erase(i);
-        break;
-      }
-    }
-
-    size_t count = 0;
-
-    detail::split(req.target.data(), req.target.data() + req.target.size(), '?',
-                  [&](const char *b, const char *e) {
-                    switch (count) {
-                    case 0:
-                      req.path = detail::decode_url(std::string(b, e), false);
-                      break;
-                    case 1: {
-                      if (e - b > 0) {
-                        detail::parse_query_text(std::string(b, e), req.params);
-                      }
-                      break;
-                    }
-                    default: break;
-                    }
-                    count++;
-                  });
-
-    if (count > 2) { return false; }
-  }
-
-  return true;
-}
-
-inline bool Server::write_response(Stream &strm, bool close_connection,
-                                   const Request &req, Response &res) {
-  return write_response_core(strm, close_connection, req, res, false);
-}
-
-inline bool Server::write_response_with_content(Stream &strm,
-                                                bool close_connection,
-                                                const Request &req,
-                                                Response &res) {
-  return write_response_core(strm, close_connection, req, res, true);
-}
-
-inline bool Server::write_response_core(Stream &strm, bool close_connection,
-                                        const Request &req, Response &res,
-                                        bool need_apply_ranges) {
-  assert(res.status != -1);
-
-  if (400 <= res.status && error_handler_ &&
-      error_handler_(req, res) == HandlerResponse::Handled) {
-    need_apply_ranges = true;
-  }
-
-  std::string content_type;
-  std::string boundary;
-  if (need_apply_ranges) { apply_ranges(req, res, content_type, boundary); }
-
-  // Prepare additional headers
-  if (close_connection || req.get_header_value("Connection") == "close") {
-    res.set_header("Connection", "close");
-  } else {
-    std::stringstream ss;
-    ss << "timeout=" << keep_alive_timeout_sec_
-       << ", max=" << keep_alive_max_count_;
-    res.set_header("Keep-Alive", ss.str());
-  }
-
-  if (!res.has_header("Content-Type") &&
-      (!res.body.empty() || res.content_length_ > 0 || res.content_provider_)) {
-    res.set_header("Content-Type", "text/plain");
-  }
-
-  if (!res.has_header("Content-Length") && res.body.empty() &&
-      !res.content_length_ && !res.content_provider_) {
-    res.set_header("Content-Length", "0");
-  }
-
-  if (!res.has_header("Accept-Ranges") && req.method == "HEAD") {
-    res.set_header("Accept-Ranges", "bytes");
-  }
-
-  if (post_routing_handler_) { post_routing_handler_(req, res); }
-
-  // Response line and headers
-  {
-    detail::BufferStream bstrm;
-
-    if (!bstrm.write_format("HTTP/1.1 %d %s\r\n", res.status,
-                            detail::status_message(res.status))) {
-      return false;
-    }
-
-    if (!detail::write_headers(bstrm, res.headers)) { return false; }
-
-    // Flush buffer
-    auto &data = bstrm.get_buffer();
-    detail::write_data(strm, data.data(), data.size());
-  }
-
-  // Body
-  auto ret = true;
-  if (req.method != "HEAD") {
-    if (!res.body.empty()) {
-      if (!detail::write_data(strm, res.body.data(), res.body.size())) {
-        ret = false;
-      }
-    } else if (res.content_provider_) {
-      if (write_content_with_provider(strm, req, res, boundary, content_type)) {
-        res.content_provider_success_ = true;
-      } else {
-        res.content_provider_success_ = false;
-        ret = false;
-      }
-    }
-  }
-
-  // Log
-  if (logger_) { logger_(req, res); }
-
-  return ret;
-}
-
-inline bool
-Server::write_content_with_provider(Stream &strm, const Request &req,
-                                    Response &res, const std::string &boundary,
-                                    const std::string &content_type) {
-  auto is_shutting_down = [this]() {
-    return this->svr_sock_ == INVALID_SOCKET;
-  };
-
-  if (res.content_length_ > 0) {
-    if (req.ranges.empty()) {
-      return detail::write_content(strm, res.content_provider_, 0,
-                                   res.content_length_, is_shutting_down);
-    } else if (req.ranges.size() == 1) {
-      auto offsets =
-          detail::get_range_offset_and_length(req, res.content_length_, 0);
-      auto offset = offsets.first;
-      auto length = offsets.second;
-      return detail::write_content(strm, res.content_provider_, offset, length,
-                                   is_shutting_down);
-    } else {
-      return detail::write_multipart_ranges_data(
-          strm, req, res, boundary, content_type, is_shutting_down);
-    }
-  } else {
-    if (res.is_chunked_content_provider_) {
-      auto type = detail::encoding_type(req, res);
-
-      std::unique_ptr<detail::compressor> compressor;
-      if (type == detail::EncodingType::Gzip) {
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-        compressor = detail::make_unique<detail::gzip_compressor>();
-#endif
-      } else if (type == detail::EncodingType::Brotli) {
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-        compressor = detail::make_unique<detail::brotli_compressor>();
-#endif
-      } else {
-        compressor = detail::make_unique<detail::nocompressor>();
-      }
-      assert(compressor != nullptr);
-
-      return detail::write_content_chunked(strm, res.content_provider_,
-                                           is_shutting_down, *compressor);
-    } else {
-      return detail::write_content_without_length(strm, res.content_provider_,
-                                                  is_shutting_down);
-    }
-  }
-}
-
-inline bool Server::read_content(Stream &strm, Request &req, Response &res) {
-  MultipartFormDataMap::iterator cur;
-  auto file_count = 0;
-  if (read_content_core(
-          strm, req, res,
-          // Regular
-          [&](const char *buf, size_t n) {
-            if (req.body.size() + n > req.body.max_size()) { return false; }
-            req.body.append(buf, n);
-            return true;
-          },
-          // Multipart
-          [&](const MultipartFormData &file) {
-            if (file_count++ == CPPHTTPLIB_MULTIPART_FORM_DATA_FILE_MAX_COUNT) {
-              return false;
-            }
-            cur = req.files.emplace(file.name, file);
-            return true;
-          },
-          [&](const char *buf, size_t n) {
-            auto &content = cur->second.content;
-            if (content.size() + n > content.max_size()) { return false; }
-            content.append(buf, n);
-            return true;
-          })) {
-    const auto &content_type = req.get_header_value("Content-Type");
-    if (!content_type.find("application/x-www-form-urlencoded")) {
-      if (req.body.size() > CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH) {
-        res.status = 413; // NOTE: should be 414?
-        return false;
-      }
-      detail::parse_query_text(req.body, req.params);
-    }
-    return true;
-  }
-  return false;
-}
-
-inline bool Server::read_content_with_content_receiver(
-    Stream &strm, Request &req, Response &res, ContentReceiver receiver,
-    MultipartContentHeader multipart_header,
-    ContentReceiver multipart_receiver) {
-  return read_content_core(strm, req, res, std::move(receiver),
-                           std::move(multipart_header),
-                           std::move(multipart_receiver));
-}
-
-inline bool Server::read_content_core(Stream &strm, Request &req, Response &res,
-                                      ContentReceiver receiver,
-                                      MultipartContentHeader multipart_header,
-                                      ContentReceiver multipart_receiver) {
-  detail::MultipartFormDataParser multipart_form_data_parser;
-  ContentReceiverWithProgress out;
-
-  if (req.is_multipart_form_data()) {
-    const auto &content_type = req.get_header_value("Content-Type");
-    std::string boundary;
-    if (!detail::parse_multipart_boundary(content_type, boundary)) {
-      res.status = 400;
-      return false;
-    }
-
-    multipart_form_data_parser.set_boundary(std::move(boundary));
-    out = [&](const char *buf, size_t n, uint64_t /*off*/, uint64_t /*len*/) {
-      /* For debug
-      size_t pos = 0;
-      while (pos < n) {
-        auto read_size = (std::min)<size_t>(1, n - pos);
-        auto ret = multipart_form_data_parser.parse(
-            buf + pos, read_size, multipart_receiver, multipart_header);
-        if (!ret) { return false; }
-        pos += read_size;
-      }
-      return true;
-      */
-      return multipart_form_data_parser.parse(buf, n, multipart_receiver,
-                                              multipart_header);
-    };
-  } else {
-    out = [receiver](const char *buf, size_t n, uint64_t /*off*/,
-                     uint64_t /*len*/) { return receiver(buf, n); };
-  }
-
-  if (req.method == "DELETE" && !req.has_header("Content-Length")) {
-    return true;
-  }
-
-  if (!detail::read_content(strm, req, payload_max_length_, res.status, nullptr,
-                            out, true)) {
-    return false;
-  }
-
-  if (req.is_multipart_form_data()) {
-    if (!multipart_form_data_parser.is_valid()) {
-      res.status = 400;
-      return false;
-    }
-  }
-
-  return true;
-}
-
-inline bool Server::handle_file_request(const Request &req, Response &res,
-                                        bool head) {
-  for (const auto &entry : base_dirs_) {
-    // Prefix match
-    if (!req.path.compare(0, entry.mount_point.size(), entry.mount_point)) {
-      std::string sub_path = "/" + req.path.substr(entry.mount_point.size());
-      if (detail::is_valid_path(sub_path)) {
-        auto path = entry.base_dir + sub_path;
-        if (path.back() == '/') { path += "index.html"; }
-
-        if (detail::is_file(path)) {
-          detail::read_file(path, res.body);
-          auto type =
-              detail::find_content_type(path, file_extension_and_mimetype_map_);
-          if (type) { res.set_header("Content-Type", type); }
-          for (const auto &kv : entry.headers) {
-            res.set_header(kv.first.c_str(), kv.second);
-          }
-          res.status = req.has_header("Range") ? 206 : 200;
-          if (!head && file_request_handler_) {
-            file_request_handler_(req, res);
-          }
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
-
-inline socket_t
-Server::create_server_socket(const std::string &host, int port,
-                             int socket_flags,
-                             SocketOptions socket_options) const {
-  return detail::create_socket(
-      host, std::string(), port, address_family_, socket_flags, tcp_nodelay_,
-      std::move(socket_options),
-      [](socket_t sock, struct addrinfo &ai) -> bool {
-        if (::bind(sock, ai.ai_addr, static_cast<socklen_t>(ai.ai_addrlen))) {
-          return false;
-        }
-        if (::listen(sock, CPPHTTPLIB_LISTEN_BACKLOG)) { return false; }
-        return true;
-      });
-}
-
-inline int Server::bind_internal(const std::string &host, int port,
-                                 int socket_flags) {
-  if (!is_valid()) { return -1; }
-
-  svr_sock_ = create_server_socket(host, port, socket_flags, socket_options_);
-  if (svr_sock_ == INVALID_SOCKET) { return -1; }
-
-  if (port == 0) {
-    struct sockaddr_storage addr;
-    socklen_t addr_len = sizeof(addr);
-    if (getsockname(svr_sock_, reinterpret_cast<struct sockaddr *>(&addr),
-                    &addr_len) == -1) {
-      return -1;
-    }
-    if (addr.ss_family == AF_INET) {
-      return ntohs(reinterpret_cast<struct sockaddr_in *>(&addr)->sin_port);
-    } else if (addr.ss_family == AF_INET6) {
-      return ntohs(reinterpret_cast<struct sockaddr_in6 *>(&addr)->sin6_port);
-    } else {
-      return -1;
-    }
-  } else {
-    return port;
-  }
-}
-
-inline bool Server::listen_internal() {
-  auto ret = true;
-  is_running_ = true;
-  auto se = detail::scope_exit([&]() { is_running_ = false; });
-
-  {
-    std::unique_ptr<TaskQueue> task_queue(new_task_queue());
-
-    while (svr_sock_ != INVALID_SOCKET) {
-#ifndef _WIN32
-      if (idle_interval_sec_ > 0 || idle_interval_usec_ > 0) {
-#endif
-        auto val = detail::select_read(svr_sock_, idle_interval_sec_,
-                                       idle_interval_usec_);
-        if (val == 0) { // Timeout
-          task_queue->on_idle();
-          continue;
-        }
-#ifndef _WIN32
-      }
-#endif
-      socket_t sock = accept(svr_sock_, nullptr, nullptr);
-
-      if (sock == INVALID_SOCKET) {
-        if (errno == EMFILE) {
-          // The per-process limit of open file descriptors has been reached.
-          // Try to accept new connections after a short sleep.
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
-          continue;
-        } else if (errno == EINTR || errno == EAGAIN) {
-          continue;
-        }
-        if (svr_sock_ != INVALID_SOCKET) {
-          detail::close_socket(svr_sock_);
-          ret = false;
-        } else {
-          ; // The server socket was closed by user.
-        }
-        break;
-      }
-
-      {
-#ifdef _WIN32
-        auto timeout = static_cast<uint32_t>(read_timeout_sec_ * 1000 +
-                                             read_timeout_usec_ / 1000);
-        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
-                   sizeof(timeout));
-#else
-        timeval tv;
-        tv.tv_sec = static_cast<long>(read_timeout_sec_);
-        tv.tv_usec = static_cast<decltype(tv.tv_usec)>(read_timeout_usec_);
-        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
-#endif
-      }
-      {
-
-#ifdef _WIN32
-        auto timeout = static_cast<uint32_t>(write_timeout_sec_ * 1000 +
-                                             write_timeout_usec_ / 1000);
-        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
-                   sizeof(timeout));
-#else
-        timeval tv;
-        tv.tv_sec = static_cast<long>(write_timeout_sec_);
-        tv.tv_usec = static_cast<decltype(tv.tv_usec)>(write_timeout_usec_);
-        setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
-#endif
-      }
-
-      task_queue->enqueue([this, sock]() { process_and_close_socket(sock); });
-    }
-
-    task_queue->shutdown();
-  }
-
-  return ret;
-}
-
-inline bool Server::routing(Request &req, Response &res, Stream &strm) {
-  if (pre_routing_handler_ &&
-      pre_routing_handler_(req, res) == HandlerResponse::Handled) {
-    return true;
-  }
-
-  // File handler
-  bool is_head_request = req.method == "HEAD";
-  if ((req.method == "GET" || is_head_request) &&
-      handle_file_request(req, res, is_head_request)) {
-    return true;
-  }
-
-  if (detail::expect_content(req)) {
-    // Content reader handler
-    {
-      ContentReader reader(
-          [&](ContentReceiver receiver) {
-            return read_content_with_content_receiver(
-                strm, req, res, std::move(receiver), nullptr, nullptr);
-          },
-          [&](MultipartContentHeader header, ContentReceiver receiver) {
-            return read_content_with_content_receiver(strm, req, res, nullptr,
-                                                      std::move(header),
-                                                      std::move(receiver));
-          });
-
-      if (req.method == "POST") {
-        if (dispatch_request_for_content_reader(
-                req, res, std::move(reader),
-                post_handlers_for_content_reader_)) {
-          return true;
-        }
-      } else if (req.method == "PUT") {
-        if (dispatch_request_for_content_reader(
-                req, res, std::move(reader),
-                put_handlers_for_content_reader_)) {
-          return true;
-        }
-      } else if (req.method == "PATCH") {
-        if (dispatch_request_for_content_reader(
-                req, res, std::move(reader),
-                patch_handlers_for_content_reader_)) {
-          return true;
-        }
-      } else if (req.method == "DELETE") {
-        if (dispatch_request_for_content_reader(
-                req, res, std::move(reader),
-                delete_handlers_for_content_reader_)) {
-          return true;
-        }
-      }
-    }
-
-    // Read content into `req.body`
-    if (!read_content(strm, req, res)) { return false; }
-  }
-
-  // Regular handler
-  if (req.method == "GET" || req.method == "HEAD") {
-    return dispatch_request(req, res, get_handlers_);
-  } else if (req.method == "POST") {
-    return dispatch_request(req, res, post_handlers_);
-  } else if (req.method == "PUT") {
-    return dispatch_request(req, res, put_handlers_);
-  } else if (req.method == "DELETE") {
-    return dispatch_request(req, res, delete_handlers_);
-  } else if (req.method == "OPTIONS") {
-    return dispatch_request(req, res, options_handlers_);
-  } else if (req.method == "PATCH") {
-    return dispatch_request(req, res, patch_handlers_);
-  }
-
-  res.status = 400;
-  return false;
-}
-
-inline bool Server::dispatch_request(Request &req, Response &res,
-                                     const Handlers &handlers) {
-  for (const auto &x : handlers) {
-    const auto &pattern = x.first;
-    const auto &handler = x.second;
-
-    if (std::regex_match(req.path, req.matches, pattern)) {
-      handler(req, res);
-      return true;
-    }
-  }
-  return false;
-}
-
-inline void Server::apply_ranges(const Request &req, Response &res,
-                                 std::string &content_type,
-                                 std::string &boundary) {
-  if (req.ranges.size() > 1) {
-    boundary = detail::make_multipart_data_boundary();
-
-    auto it = res.headers.find("Content-Type");
-    if (it != res.headers.end()) {
-      content_type = it->second;
-      res.headers.erase(it);
-    }
-
-    res.headers.emplace("Content-Type",
-                        "multipart/byteranges; boundary=" + boundary);
-  }
-
-  auto type = detail::encoding_type(req, res);
-
-  if (res.body.empty()) {
-    if (res.content_length_ > 0) {
-      size_t length = 0;
-      if (req.ranges.empty()) {
-        length = res.content_length_;
-      } else if (req.ranges.size() == 1) {
-        auto offsets =
-            detail::get_range_offset_and_length(req, res.content_length_, 0);
-        auto offset = offsets.first;
-        length = offsets.second;
-        auto content_range = detail::make_content_range_header_field(
-            offset, length, res.content_length_);
-        res.set_header("Content-Range", content_range);
-      } else {
-        length = detail::get_multipart_ranges_data_length(req, res, boundary,
-                                                          content_type);
-      }
-      res.set_header("Content-Length", std::to_string(length));
-    } else {
-      if (res.content_provider_) {
-        if (res.is_chunked_content_provider_) {
-          res.set_header("Transfer-Encoding", "chunked");
-          if (type == detail::EncodingType::Gzip) {
-            res.set_header("Content-Encoding", "gzip");
-          } else if (type == detail::EncodingType::Brotli) {
-            res.set_header("Content-Encoding", "br");
-          }
-        }
-      }
-    }
-  } else {
-    if (req.ranges.empty()) {
-      ;
-    } else if (req.ranges.size() == 1) {
-      auto offsets =
-          detail::get_range_offset_and_length(req, res.body.size(), 0);
-      auto offset = offsets.first;
-      auto length = offsets.second;
-      auto content_range = detail::make_content_range_header_field(
-          offset, length, res.body.size());
-      res.set_header("Content-Range", content_range);
-      if (offset < res.body.size()) {
-        res.body = res.body.substr(offset, length);
-      } else {
-        res.body.clear();
-        res.status = 416;
-      }
-    } else {
-      std::string data;
-      if (detail::make_multipart_ranges_data(req, res, boundary, content_type,
-                                             data)) {
-        res.body.swap(data);
-      } else {
-        res.body.clear();
-        res.status = 416;
-      }
-    }
-
-    if (type != detail::EncodingType::None) {
-      std::unique_ptr<detail::compressor> compressor;
-      std::string content_encoding;
-
-      if (type == detail::EncodingType::Gzip) {
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-        compressor = detail::make_unique<detail::gzip_compressor>();
-        content_encoding = "gzip";
-#endif
-      } else if (type == detail::EncodingType::Brotli) {
-#ifdef CPPHTTPLIB_BROTLI_SUPPORT
-        compressor = detail::make_unique<detail::brotli_compressor>();
-        content_encoding = "br";
-#endif
-      }
-
-      if (compressor) {
-        std::string compressed;
-        if (compressor->compress(res.body.data(), res.body.size(), true,
-                                 [&](const char *data, size_t data_len) {
-                                   compressed.append(data, data_len);
-                                   return true;
-                                 })) {
-          res.body.swap(compressed);
-          res.set_header("Content-Encoding", content_encoding);
-        }
-      }
-    }
-
-    auto length = std::to_string(res.body.size());
-    res.set_header("Content-Length", length);
-  }
-}
-
-inline bool Server::dispatch_request_for_content_reader(
-    Request &req, Response &res, ContentReader content_reader,
-    const HandlersForContentReader &handlers) {
-  for (const auto &x : handlers) {
-    const auto &pattern = x.first;
-    const auto &handler = x.second;
-
-    if (std::regex_match(req.path, req.matches, pattern)) {
-      handler(req, res, content_reader);
-      return true;
-    }
-  }
-  return false;
-}
-
-inline bool
-Server::process_request(Stream &strm, bool close_connection,
-                        bool &connection_closed,
-                        const std::function<void(Request &)> &setup_request) {
-  std::array<char, 2048> buf{};
-
-  detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
-
-  // Connection has been closed on client
-  if (!line_reader.getline()) { return false; }
-
-  Request req;
-  Response res;
-
-  res.version = "HTTP/1.1";
-
-  for (const auto &header : default_headers_) {
-    if (res.headers.find(header.first) == res.headers.end()) {
-      res.headers.insert(header);
-    }
-  }
-
-#ifdef _WIN32
-  // TODO: Increase FD_SETSIZE statically (libzmq), dynamically (MySQL).
-#else
-#ifndef CPPHTTPLIB_USE_POLL
-  // Socket file descriptor exceeded FD_SETSIZE...
-  if (strm.socket() >= FD_SETSIZE) {
-    Headers dummy;
-    detail::read_headers(strm, dummy);
-    res.status = 500;
-    return write_response(strm, close_connection, req, res);
-  }
-#endif
-#endif
-
-  // Check if the request URI doesn't exceed the limit
-  if (line_reader.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
-    Headers dummy;
-    detail::read_headers(strm, dummy);
-    res.status = 414;
-    return write_response(strm, close_connection, req, res);
-  }
-
-  // Request line and headers
-  if (!parse_request_line(line_reader.ptr(), req) ||
-      !detail::read_headers(strm, req.headers)) {
-    res.status = 400;
-    return write_response(strm, close_connection, req, res);
-  }
-
-  if (req.get_header_value("Connection") == "close") {
-    connection_closed = true;
-  }
-
-  if (req.version == "HTTP/1.0" &&
-      req.get_header_value("Connection") != "Keep-Alive") {
-    connection_closed = true;
-  }
-
-  strm.get_remote_ip_and_port(req.remote_addr, req.remote_port);
-  req.set_header("REMOTE_ADDR", req.remote_addr);
-  req.set_header("REMOTE_PORT", std::to_string(req.remote_port));
-
-  strm.get_local_ip_and_port(req.local_addr, req.local_port);
-  req.set_header("LOCAL_ADDR", req.local_addr);
-  req.set_header("LOCAL_PORT", std::to_string(req.local_port));
-
-  if (req.has_header("Range")) {
-    const auto &range_header_value = req.get_header_value("Range");
-    if (!detail::parse_range_header(range_header_value, req.ranges)) {
-      res.status = 416;
-      return write_response(strm, close_connection, req, res);
-    }
-  }
-
-  if (setup_request) { setup_request(req); }
-
-  if (req.get_header_value("Expect") == "100-continue") {
-    auto status = 100;
-    if (expect_100_continue_handler_) {
-      status = expect_100_continue_handler_(req, res);
-    }
-    switch (status) {
-    case 100:
-    case 417:
-      strm.write_format("HTTP/1.1 %d %s\r\n\r\n", status,
-                        detail::status_message(status));
-      break;
-    default: return write_response(strm, close_connection, req, res);
-    }
-  }
-
-  // Rounting
-  bool routed = false;
-#ifdef CPPHTTPLIB_NO_EXCEPTIONS
-  routed = routing(req, res, strm);
-#else
-  try {
-    routed = routing(req, res, strm);
-  } catch (std::exception &e) {
-    if (exception_handler_) {
-      auto ep = std::current_exception();
-      exception_handler_(req, res, ep);
-      routed = true;
-    } else {
-      res.status = 500;
-      std::string val;
-      auto s = e.what();
-      for (size_t i = 0; s[i]; i++) {
-        switch (s[i]) {
-        case '\r': val += "\\r"; break;
-        case '\n': val += "\\n"; break;
-        default: val += s[i]; break;
-        }
-      }
-      res.set_header("EXCEPTION_WHAT", val);
-    }
-  } catch (...) {
-    if (exception_handler_) {
-      auto ep = std::current_exception();
-      exception_handler_(req, res, ep);
-      routed = true;
-    } else {
-      res.status = 500;
-      res.set_header("EXCEPTION_WHAT", "UNKNOWN");
-    }
-  }
-#endif
-
-  if (routed) {
-    if (res.status == -1) { res.status = req.ranges.empty() ? 200 : 206; }
-    return write_response_with_content(strm, close_connection, req, res);
-  } else {
-    if (res.status == -1) { res.status = 404; }
-    return write_response(strm, close_connection, req, res);
-  }
-}
-
-inline bool Server::is_valid() const { return true; }
-
-inline bool Server::process_and_close_socket(socket_t sock) {
-  auto ret = detail::process_server_socket(
-      svr_sock_, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
-      read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-      write_timeout_usec_,
-      [this](Stream &strm, bool close_connection, bool &connection_closed) {
-        return process_request(strm, close_connection, connection_closed,
-                               nullptr);
-      });
-
-  detail::shutdown_socket(sock);
-  detail::close_socket(sock);
-  return ret;
-}
-
-// HTTP client implementation
-inline ClientImpl::ClientImpl(const std::string &host)
-    : ClientImpl(host, 80, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port)
-    : ClientImpl(host, port, std::string(), std::string()) {}
-
-inline ClientImpl::ClientImpl(const std::string &host, int port,
-                              const std::string &client_cert_path,
-                              const std::string &client_key_path)
-    : host_(host), port_(port),
-      host_and_port_(adjust_host_string(host) + ":" + std::to_string(port)),
-      client_cert_path_(client_cert_path), client_key_path_(client_key_path) {}
-
-inline ClientImpl::~ClientImpl() {
-  std::lock_guard<std::mutex> guard(socket_mutex_);
-  shutdown_socket(socket_);
-  close_socket(socket_);
-}
-
-inline bool ClientImpl::is_valid() const { return true; }
-
-inline void ClientImpl::copy_settings(const ClientImpl &rhs) {
-  client_cert_path_ = rhs.client_cert_path_;
-  client_key_path_ = rhs.client_key_path_;
-  connection_timeout_sec_ = rhs.connection_timeout_sec_;
-  read_timeout_sec_ = rhs.read_timeout_sec_;
-  read_timeout_usec_ = rhs.read_timeout_usec_;
-  write_timeout_sec_ = rhs.write_timeout_sec_;
-  write_timeout_usec_ = rhs.write_timeout_usec_;
-  basic_auth_username_ = rhs.basic_auth_username_;
-  basic_auth_password_ = rhs.basic_auth_password_;
-  bearer_token_auth_token_ = rhs.bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  digest_auth_username_ = rhs.digest_auth_username_;
-  digest_auth_password_ = rhs.digest_auth_password_;
-#endif
-  keep_alive_ = rhs.keep_alive_;
-  follow_location_ = rhs.follow_location_;
-  url_encode_ = rhs.url_encode_;
-  address_family_ = rhs.address_family_;
-  tcp_nodelay_ = rhs.tcp_nodelay_;
-  socket_options_ = rhs.socket_options_;
-  compress_ = rhs.compress_;
-  decompress_ = rhs.decompress_;
-  interface_ = rhs.interface_;
-  proxy_host_ = rhs.proxy_host_;
-  proxy_port_ = rhs.proxy_port_;
-  proxy_basic_auth_username_ = rhs.proxy_basic_auth_username_;
-  proxy_basic_auth_password_ = rhs.proxy_basic_auth_password_;
-  proxy_bearer_token_auth_token_ = rhs.proxy_bearer_token_auth_token_;
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  proxy_digest_auth_username_ = rhs.proxy_digest_auth_username_;
-  proxy_digest_auth_password_ = rhs.proxy_digest_auth_password_;
-#endif
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  ca_cert_file_path_ = rhs.ca_cert_file_path_;
-  ca_cert_dir_path_ = rhs.ca_cert_dir_path_;
-  ca_cert_store_ = rhs.ca_cert_store_;
-#endif
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  server_certificate_verification_ = rhs.server_certificate_verification_;
-#endif
-  logger_ = rhs.logger_;
-}
-
-inline socket_t ClientImpl::create_client_socket(Error &error) const {
-  if (!proxy_host_.empty() && proxy_port_ != -1) {
-    return detail::create_client_socket(
-        proxy_host_, std::string(), proxy_port_, address_family_, tcp_nodelay_,
-        socket_options_, connection_timeout_sec_, connection_timeout_usec_,
-        read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-        write_timeout_usec_, interface_, error);
-  }
-
-  // Check is custom IP specified for host_
-  std::string ip;
-  auto it = addr_map_.find(host_);
-  if (it != addr_map_.end()) ip = it->second;
-
-  return detail::create_client_socket(
-      host_, ip, port_, address_family_, tcp_nodelay_, socket_options_,
-      connection_timeout_sec_, connection_timeout_usec_, read_timeout_sec_,
-      read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, interface_,
-      error);
-}
-
-inline bool ClientImpl::create_and_connect_socket(Socket &socket,
-                                                  Error &error) {
-  auto sock = create_client_socket(error);
-  if (sock == INVALID_SOCKET) { return false; }
-  socket.sock = sock;
-  return true;
-}
-
-inline void ClientImpl::shutdown_ssl(Socket & /*socket*/,
-                                     bool /*shutdown_gracefully*/) {
-  // If there are any requests in flight from threads other than us, then it's
-  // a thread-unsafe race because individual ssl* objects are not thread-safe.
-  assert(socket_requests_in_flight_ == 0 ||
-         socket_requests_are_from_thread_ == std::this_thread::get_id());
-}
-
-inline void ClientImpl::shutdown_socket(Socket &socket) {
-  if (socket.sock == INVALID_SOCKET) { return; }
-  detail::shutdown_socket(socket.sock);
-}
-
-inline void ClientImpl::close_socket(Socket &socket) {
-  // If there are requests in flight in another thread, usually closing
-  // the socket will be fine and they will simply receive an error when
-  // using the closed socket, but it is still a bug since rarely the OS
-  // may reassign the socket id to be used for a new socket, and then
-  // suddenly they will be operating on a live socket that is different
-  // than the one they intended!
-  assert(socket_requests_in_flight_ == 0 ||
-         socket_requests_are_from_thread_ == std::this_thread::get_id());
-
-  // It is also a bug if this happens while SSL is still active
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  assert(socket.ssl == nullptr);
-#endif
-  if (socket.sock == INVALID_SOCKET) { return; }
-  detail::close_socket(socket.sock);
-  socket.sock = INVALID_SOCKET;
-}
-
-inline bool ClientImpl::read_response_line(Stream &strm, const Request &req,
-                                           Response &res) {
-  std::array<char, 2048> buf{};
-
-  detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
-
-  if (!line_reader.getline()) { return false; }
-
-#ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR
-  const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r\n");
-#else
-  const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r?\n");
-#endif
-
-  std::cmatch m;
-  if (!std::regex_match(line_reader.ptr(), m, re)) {
-    return req.method == "CONNECT";
-  }
-  res.version = std::string(m[1]);
-  res.status = std::stoi(std::string(m[2]));
-  res.reason = std::string(m[3]);
-
-  // Ignore '100 Continue'
-  while (res.status == 100) {
-    if (!line_reader.getline()) { return false; } // CRLF
-    if (!line_reader.getline()) { return false; } // next response line
-
-    if (!std::regex_match(line_reader.ptr(), m, re)) { return false; }
-    res.version = std::string(m[1]);
-    res.status = std::stoi(std::string(m[2]));
-    res.reason = std::string(m[3]);
-  }
-
-  return true;
-}
-
-inline bool ClientImpl::send(Request &req, Response &res, Error &error) {
-  std::lock_guard<std::recursive_mutex> request_mutex_guard(request_mutex_);
-  auto ret = send_(req, res, error);
-  if (error == Error::SSLPeerCouldBeClosed_) {
-    assert(!ret);
-    ret = send_(req, res, error);
-  }
-  return ret;
-}
-
-inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
-  {
-    std::lock_guard<std::mutex> guard(socket_mutex_);
-
-    // Set this to false immediately - if it ever gets set to true by the end of
-    // the request, we know another thread instructed us to close the socket.
-    socket_should_be_closed_when_request_is_done_ = false;
-
-    auto is_alive = false;
-    if (socket_.is_open()) {
-      is_alive = detail::is_socket_alive(socket_.sock);
-      if (!is_alive) {
-        // Attempt to avoid sigpipe by shutting down nongracefully if it seems
-        // like the other side has already closed the connection Also, there
-        // cannot be any requests in flight from other threads since we locked
-        // request_mutex_, so safe to close everything immediately
-        const bool shutdown_gracefully = false;
-        shutdown_ssl(socket_, shutdown_gracefully);
-        shutdown_socket(socket_);
-        close_socket(socket_);
-      }
-    }
-
-    if (!is_alive) {
-      if (!create_and_connect_socket(socket_, error)) { return false; }
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      // TODO: refactoring
-      if (is_ssl()) {
-        auto &scli = static_cast<SSLClient &>(*this);
-        if (!proxy_host_.empty() && proxy_port_ != -1) {
-          auto success = false;
-          if (!scli.connect_with_proxy(socket_, res, success, error)) {
-            return success;
-          }
-        }
-
-        if (!scli.initialize_ssl(socket_, error)) { return false; }
-      }
-#endif
-    }
-
-    // Mark the current socket as being in use so that it cannot be closed by
-    // anyone else while this request is ongoing, even though we will be
-    // releasing the mutex.
-    if (socket_requests_in_flight_ > 1) {
-      assert(socket_requests_are_from_thread_ == std::this_thread::get_id());
-    }
-    socket_requests_in_flight_ += 1;
-    socket_requests_are_from_thread_ = std::this_thread::get_id();
-  }
-
-  for (const auto &header : default_headers_) {
-    if (req.headers.find(header.first) == req.headers.end()) {
-      req.headers.insert(header);
-    }
-  }
-
-  auto ret = false;
-  auto close_connection = !keep_alive_;
-
-  auto se = detail::scope_exit([&]() {
-    // Briefly lock mutex in order to mark that a request is no longer ongoing
-    std::lock_guard<std::mutex> guard(socket_mutex_);
-    socket_requests_in_flight_ -= 1;
-    if (socket_requests_in_flight_ <= 0) {
-      assert(socket_requests_in_flight_ == 0);
-      socket_requests_are_from_thread_ = std::thread::id();
-    }
-
-    if (socket_should_be_closed_when_request_is_done_ || close_connection ||
-        !ret) {
-      shutdown_ssl(socket_, true);
-      shutdown_socket(socket_);
-      close_socket(socket_);
-    }
-  });
-
-  ret = process_socket(socket_, [&](Stream &strm) {
-    return handle_request(strm, req, res, close_connection, error);
-  });
-
-  if (!ret) {
-    if (error == Error::Success) { error = Error::Unknown; }
-  }
-
-  return ret;
-}
-
-inline Result ClientImpl::send(const Request &req) {
-  auto req2 = req;
-  return send_(std::move(req2));
-}
-
-inline Result ClientImpl::send_(Request &&req) {
-  auto res = detail::make_unique<Response>();
-  auto error = Error::Success;
-  auto ret = send(req, *res, error);
-  return Result{ret ? std::move(res) : nullptr, error, std::move(req.headers)};
-}
-
-inline bool ClientImpl::handle_request(Stream &strm, Request &req,
-                                       Response &res, bool close_connection,
-                                       Error &error) {
-  if (req.path.empty()) {
-    error = Error::Connection;
-    return false;
-  }
-
-  auto req_save = req;
-
-  bool ret;
-
-  if (!is_ssl() && !proxy_host_.empty() && proxy_port_ != -1) {
-    auto req2 = req;
-    req2.path = "http://" + host_and_port_ + req.path;
-    ret = process_request(strm, req2, res, close_connection, error);
-    req = req2;
-    req.path = req_save.path;
-  } else {
-    ret = process_request(strm, req, res, close_connection, error);
-  }
-
-  if (!ret) { return false; }
-
-  if (300 < res.status && res.status < 400 && follow_location_) {
-    req = req_save;
-    ret = redirect(req, res, error);
-  }
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  if ((res.status == 401 || res.status == 407) &&
-      req.authorization_count_ < 5) {
-    auto is_proxy = res.status == 407;
-    const auto &username =
-        is_proxy ? proxy_digest_auth_username_ : digest_auth_username_;
-    const auto &password =
-        is_proxy ? proxy_digest_auth_password_ : digest_auth_password_;
-
-    if (!username.empty() && !password.empty()) {
-      std::map<std::string, std::string> auth;
-      if (detail::parse_www_authenticate(res, auth, is_proxy)) {
-        Request new_req = req;
-        new_req.authorization_count_ += 1;
-        new_req.headers.erase(is_proxy ? "Proxy-Authorization"
-                                       : "Authorization");
-        new_req.headers.insert(detail::make_digest_authentication_header(
-            req, auth, new_req.authorization_count_, detail::random_string(10),
-            username, password, is_proxy));
-
-        Response new_res;
-
-        ret = send(new_req, new_res, error);
-        if (ret) { res = new_res; }
-      }
-    }
-  }
-#endif
-
-  return ret;
-}
-
-inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
-  if (req.redirect_count_ == 0) {
-    error = Error::ExceedRedirectCount;
-    return false;
-  }
-
-  auto location = res.get_header_value("location");
-  if (location.empty()) { return false; }
-
-  const static std::regex re(
-      R"((?:(https?):)?(?://(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)?([^?#]*)(\?[^#]*)?(?:#.*)?)");
-
-  std::smatch m;
-  if (!std::regex_match(location, m, re)) { return false; }
-
-  auto scheme = is_ssl() ? "https" : "http";
-
-  auto next_scheme = m[1].str();
-  auto next_host = m[2].str();
-  if (next_host.empty()) { next_host = m[3].str(); }
-  auto port_str = m[4].str();
-  auto next_path = m[5].str();
-  auto next_query = m[6].str();
-
-  auto next_port = port_;
-  if (!port_str.empty()) {
-    next_port = std::stoi(port_str);
-  } else if (!next_scheme.empty()) {
-    next_port = next_scheme == "https" ? 443 : 80;
-  }
-
-  if (next_scheme.empty()) { next_scheme = scheme; }
-  if (next_host.empty()) { next_host = host_; }
-  if (next_path.empty()) { next_path = "/"; }
-
-  auto path = detail::decode_url(next_path, true) + next_query;
-
-  if (next_scheme == scheme && next_host == host_ && next_port == port_) {
-    return detail::redirect(*this, req, res, path, location, error);
-  } else {
-    if (next_scheme == "https") {
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      SSLClient cli(next_host.c_str(), next_port);
-      cli.copy_settings(*this);
-      if (ca_cert_store_) { cli.set_ca_cert_store(ca_cert_store_); }
-      return detail::redirect(cli, req, res, path, location, error);
-#else
-      return false;
-#endif
-    } else {
-      ClientImpl cli(next_host.c_str(), next_port);
-      cli.copy_settings(*this);
-      return detail::redirect(cli, req, res, path, location, error);
-    }
-  }
-}
-
-inline bool ClientImpl::write_content_with_provider(Stream &strm,
-                                                    const Request &req,
-                                                    Error &error) {
-  auto is_shutting_down = []() { return false; };
-
-  if (req.is_chunked_content_provider_) {
-    // TODO: Brotli support
-    std::unique_ptr<detail::compressor> compressor;
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-    if (compress_) {
-      compressor = detail::make_unique<detail::gzip_compressor>();
-    } else
-#endif
-    {
-      compressor = detail::make_unique<detail::nocompressor>();
-    }
-
-    return detail::write_content_chunked(strm, req.content_provider_,
-                                         is_shutting_down, *compressor, error);
-  } else {
-    return detail::write_content(strm, req.content_provider_, 0,
-                                 req.content_length_, is_shutting_down, error);
-  }
-}
-
-inline bool ClientImpl::write_request(Stream &strm, Request &req,
-                                      bool close_connection, Error &error) {
-  // Prepare additional headers
-  if (close_connection) {
-    if (!req.has_header("Connection")) {
-      req.headers.emplace("Connection", "close");
-    }
-  }
-
-  if (!req.has_header("Host")) {
-    if (is_ssl()) {
-      if (port_ == 443) {
-        req.headers.emplace("Host", host_);
-      } else {
-        req.headers.emplace("Host", host_and_port_);
-      }
-    } else {
-      if (port_ == 80) {
-        req.headers.emplace("Host", host_);
-      } else {
-        req.headers.emplace("Host", host_and_port_);
-      }
-    }
-  }
-
-  if (!req.has_header("Accept")) { req.headers.emplace("Accept", "*/*"); }
-
-#ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT
-  if (!req.has_header("User-Agent")) {
-    auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION;
-    req.headers.emplace("User-Agent", agent);
-  }
-#endif
-
-  if (req.body.empty()) {
-    if (req.content_provider_) {
-      if (!req.is_chunked_content_provider_) {
-        if (!req.has_header("Content-Length")) {
-          auto length = std::to_string(req.content_length_);
-          req.headers.emplace("Content-Length", length);
-        }
-      }
-    } else {
-      if (req.method == "POST" || req.method == "PUT" ||
-          req.method == "PATCH") {
-        req.headers.emplace("Content-Length", "0");
-      }
-    }
-  } else {
-    if (!req.has_header("Content-Type")) {
-      req.headers.emplace("Content-Type", "text/plain");
-    }
-
-    if (!req.has_header("Content-Length")) {
-      auto length = std::to_string(req.body.size());
-      req.headers.emplace("Content-Length", length);
-    }
-  }
-
-  if (!basic_auth_password_.empty() || !basic_auth_username_.empty()) {
-    if (!req.has_header("Authorization")) {
-      req.headers.insert(make_basic_authentication_header(
-          basic_auth_username_, basic_auth_password_, false));
-    }
-  }
-
-  if (!proxy_basic_auth_username_.empty() &&
-      !proxy_basic_auth_password_.empty()) {
-    if (!req.has_header("Proxy-Authorization")) {
-      req.headers.insert(make_basic_authentication_header(
-          proxy_basic_auth_username_, proxy_basic_auth_password_, true));
-    }
-  }
-
-  if (!bearer_token_auth_token_.empty()) {
-    if (!req.has_header("Authorization")) {
-      req.headers.insert(make_bearer_token_authentication_header(
-          bearer_token_auth_token_, false));
-    }
-  }
-
-  if (!proxy_bearer_token_auth_token_.empty()) {
-    if (!req.has_header("Proxy-Authorization")) {
-      req.headers.insert(make_bearer_token_authentication_header(
-          proxy_bearer_token_auth_token_, true));
-    }
-  }
-
-  // Request line and headers
-  {
-    detail::BufferStream bstrm;
-
-    const auto &path = url_encode_ ? detail::encode_url(req.path) : req.path;
-    bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str());
-
-    detail::write_headers(bstrm, req.headers);
-
-    // Flush buffer
-    auto &data = bstrm.get_buffer();
-    if (!detail::write_data(strm, data.data(), data.size())) {
-      error = Error::Write;
-      return false;
-    }
-  }
-
-  // Body
-  if (req.body.empty()) {
-    return write_content_with_provider(strm, req, error);
-  }
-
-  if (!detail::write_data(strm, req.body.data(), req.body.size())) {
-    error = Error::Write;
-    return false;
-  }
-
-  return true;
-}
-
-inline std::unique_ptr<Response> ClientImpl::send_with_content_provider(
-    Request &req, const char *body, size_t content_length,
-    ContentProvider content_provider,
-    ContentProviderWithoutLength content_provider_without_length,
-    const std::string &content_type, Error &error) {
-  if (!content_type.empty()) {
-    req.headers.emplace("Content-Type", content_type);
-  }
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-  if (compress_) { req.headers.emplace("Content-Encoding", "gzip"); }
-#endif
-
-#ifdef CPPHTTPLIB_ZLIB_SUPPORT
-  if (compress_ && !content_provider_without_length) {
-    // TODO: Brotli support
-    detail::gzip_compressor compressor;
-
-    if (content_provider) {
-      auto ok = true;
-      size_t offset = 0;
-      DataSink data_sink;
-
-      data_sink.write = [&](const char *data, size_t data_len) -> bool {
-        if (ok) {
-          auto last = offset + data_len == content_length;
-
-          auto ret = compressor.compress(
-              data, data_len, last,
-              [&](const char *compressed_data, size_t compressed_data_len) {
-                req.body.append(compressed_data, compressed_data_len);
-                return true;
-              });
-
-          if (ret) {
-            offset += data_len;
-          } else {
-            ok = false;
-          }
-        }
-        return ok;
-      };
-
-      while (ok && offset < content_length) {
-        if (!content_provider(offset, content_length - offset, data_sink)) {
-          error = Error::Canceled;
-          return nullptr;
-        }
-      }
-    } else {
-      if (!compressor.compress(body, content_length, true,
-                               [&](const char *data, size_t data_len) {
-                                 req.body.append(data, data_len);
-                                 return true;
-                               })) {
-        error = Error::Compression;
-        return nullptr;
-      }
-    }
-  } else
-#endif
-  {
-    if (content_provider) {
-      req.content_length_ = content_length;
-      req.content_provider_ = std::move(content_provider);
-      req.is_chunked_content_provider_ = false;
-    } else if (content_provider_without_length) {
-      req.content_length_ = 0;
-      req.content_provider_ = detail::ContentProviderAdapter(
-          std::move(content_provider_without_length));
-      req.is_chunked_content_provider_ = true;
-      req.headers.emplace("Transfer-Encoding", "chunked");
-    } else {
-      req.body.assign(body, content_length);
-      ;
-    }
-  }
-
-  auto res = detail::make_unique<Response>();
-  return send(req, *res, error) ? std::move(res) : nullptr;
-}
-
-inline Result ClientImpl::send_with_content_provider(
-    const std::string &method, const std::string &path, const Headers &headers,
-    const char *body, size_t content_length, ContentProvider content_provider,
-    ContentProviderWithoutLength content_provider_without_length,
-    const std::string &content_type) {
-  Request req;
-  req.method = method;
-  req.headers = headers;
-  req.path = path;
-
-  auto error = Error::Success;
-
-  auto res = send_with_content_provider(
-      req, body, content_length, std::move(content_provider),
-      std::move(content_provider_without_length), content_type, error);
-
-  return Result{std::move(res), error, std::move(req.headers)};
-}
-
-inline std::string
-ClientImpl::adjust_host_string(const std::string &host) const {
-  if (host.find(':') != std::string::npos) { return "[" + host + "]"; }
-  return host;
-}
-
-inline bool ClientImpl::process_request(Stream &strm, Request &req,
-                                        Response &res, bool close_connection,
-                                        Error &error) {
-  // Send request
-  if (!write_request(strm, req, close_connection, error)) { return false; }
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-  if (is_ssl()) {
-    auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1;
-    if (!is_proxy_enabled) {
-      char buf[1];
-      if (SSL_peek(socket_.ssl, buf, 1) == 0 &&
-          SSL_get_error(socket_.ssl, 0) == SSL_ERROR_ZERO_RETURN) {
-        error = Error::SSLPeerCouldBeClosed_;
-        return false;
-      }
-    }
-  }
-#endif
-
-  // Receive response and headers
-  if (!read_response_line(strm, req, res) ||
-      !detail::read_headers(strm, res.headers)) {
-    error = Error::Read;
-    return false;
-  }
-
-  // Body
-  if ((res.status != 204) && req.method != "HEAD" && req.method != "CONNECT") {
-    auto redirect = 300 < res.status && res.status < 400 && follow_location_;
-
-    if (req.response_handler && !redirect) {
-      if (!req.response_handler(res)) {
-        error = Error::Canceled;
-        return false;
-      }
-    }
-
-    auto out =
-        req.content_receiver
-            ? static_cast<ContentReceiverWithProgress>(
-                  [&](const char *buf, size_t n, uint64_t off, uint64_t len) {
-                    if (redirect) { return true; }
-                    auto ret = req.content_receiver(buf, n, off, len);
-                    if (!ret) { error = Error::Canceled; }
-                    return ret;
-                  })
-            : static_cast<ContentReceiverWithProgress>(
-                  [&](const char *buf, size_t n, uint64_t /*off*/,
-                      uint64_t /*len*/) {
-                    if (res.body.size() + n > res.body.max_size()) {
-                      return false;
-                    }
-                    res.body.append(buf, n);
-                    return true;
-                  });
-
-    auto progress = [&](uint64_t current, uint64_t total) {
-      if (!req.progress || redirect) { return true; }
-      auto ret = req.progress(current, total);
-      if (!ret) { error = Error::Canceled; }
-      return ret;
-    };
-
-    int dummy_status;
-    if (!detail::read_content(strm, res, (std::numeric_limits<size_t>::max)(),
-                              dummy_status, std::move(progress), std::move(out),
-                              decompress_)) {
-      if (error != Error::Canceled) { error = Error::Read; }
-      return false;
-    }
-  }
-
-  if (res.get_header_value("Connection") == "close" ||
-      (res.version == "HTTP/1.0" && res.reason != "Connection established")) {
-    // TODO this requires a not-entirely-obvious chain of calls to be correct
-    // for this to be safe. Maybe a code refactor (such as moving this out to
-    // the send function and getting rid of the recursiveness of the mutex)
-    // could make this more obvious.
-
-    // This is safe to call because process_request is only called by
-    // handle_request which is only called by send, which locks the request
-    // mutex during the process. It would be a bug to call it from a different
-    // thread since it's a thread-safety issue to do these things to the socket
-    // if another thread is using the socket.
-    std::lock_guard<std::mutex> guard(socket_mutex_);
-    shutdown_ssl(socket_, true);
-    shutdown_socket(socket_);
-    close_socket(socket_);
-  }
-
-  // Log
-  if (logger_) { logger_(req, res); }
-
-  return true;
-}
-
-inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider(
-    const std::string &boundary, const MultipartFormDataItems &items,
-    const MultipartFormDataProviderItems &provider_items) {
-  size_t cur_item = 0, cur_start = 0;
-  // cur_item and cur_start are copied to within the std::function and maintain
-  // state between successive calls
-  return [&, cur_item, cur_start](size_t offset,
-                                  DataSink &sink) mutable -> bool {
-    if (!offset && items.size()) {
-      sink.os << detail::serialize_multipart_formdata(items, boundary, false);
-      return true;
-    } else if (cur_item < provider_items.size()) {
-      if (!cur_start) {
-        const auto &begin = detail::serialize_multipart_formdata_item_begin(
-            provider_items[cur_item], boundary);
-        offset += begin.size();
-        cur_start = offset;
-        sink.os << begin;
-      }
-
-      DataSink cur_sink;
-      bool has_data = true;
-      cur_sink.write = sink.write;
-      cur_sink.done = [&]() { has_data = false; };
-
-      if (!provider_items[cur_item].provider(offset - cur_start, cur_sink))
-        return false;
-
-      if (!has_data) {
-        sink.os << detail::serialize_multipart_formdata_item_end();
-        cur_item++;
-        cur_start = 0;
-      }
-      return true;
-    } else {
-      sink.os << detail::serialize_multipart_formdata_finish(boundary);
-      sink.done();
-      return true;
-    }
-  };
-}
-
-inline bool
-ClientImpl::process_socket(const Socket &socket,
-                           std::function<bool(Stream &strm)> callback) {
-  return detail::process_client_socket(
-      socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-      write_timeout_usec_, std::move(callback));
-}
-
-inline bool ClientImpl::is_ssl() const { return false; }
-
-inline Result ClientImpl::Get(const std::string &path) {
-  return Get(path, Headers(), Progress());
-}
-
-inline Result ClientImpl::Get(const std::string &path, Progress progress) {
-  return Get(path, Headers(), std::move(progress));
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers) {
-  return Get(path, headers, Progress());
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
-                              Progress progress) {
-  Request req;
-  req.method = "GET";
-  req.path = path;
-  req.headers = headers;
-  req.progress = std::move(progress);
-
-  return send_(std::move(req));
-}
-
-inline Result ClientImpl::Get(const std::string &path,
-                              ContentReceiver content_receiver) {
-  return Get(path, Headers(), nullptr, std::move(content_receiver), nullptr);
-}
-
-inline Result ClientImpl::Get(const std::string &path,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  return Get(path, Headers(), nullptr, std::move(content_receiver),
-             std::move(progress));
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
-                              ContentReceiver content_receiver) {
-  return Get(path, headers, nullptr, std::move(content_receiver), nullptr);
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  return Get(path, headers, nullptr, std::move(content_receiver),
-             std::move(progress));
-}
-
-inline Result ClientImpl::Get(const std::string &path,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver) {
-  return Get(path, Headers(), std::move(response_handler),
-             std::move(content_receiver), nullptr);
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver) {
-  return Get(path, headers, std::move(response_handler),
-             std::move(content_receiver), nullptr);
-}
-
-inline Result ClientImpl::Get(const std::string &path,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  return Get(path, Headers(), std::move(response_handler),
-             std::move(content_receiver), std::move(progress));
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  Request req;
-  req.method = "GET";
-  req.path = path;
-  req.headers = headers;
-  req.response_handler = std::move(response_handler);
-  req.content_receiver =
-      [content_receiver](const char *data, size_t data_length,
-                         uint64_t /*offset*/, uint64_t /*total_length*/) {
-        return content_receiver(data, data_length);
-      };
-  req.progress = std::move(progress);
-
-  return send_(std::move(req));
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Params &params,
-                              const Headers &headers, Progress progress) {
-  if (params.empty()) { return Get(path, headers); }
-
-  std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, progress);
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Params &params,
-                              const Headers &headers,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  return Get(path, params, headers, nullptr, content_receiver, progress);
-}
-
-inline Result ClientImpl::Get(const std::string &path, const Params &params,
-                              const Headers &headers,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
-  if (params.empty()) {
-    return Get(path, headers, response_handler, content_receiver, progress);
-  }
-
-  std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, response_handler,
-             content_receiver, progress);
-}
-
-inline Result ClientImpl::Head(const std::string &path) {
-  return Head(path, Headers());
-}
-
-inline Result ClientImpl::Head(const std::string &path,
-                               const Headers &headers) {
-  Request req;
-  req.method = "HEAD";
-  req.headers = headers;
-  req.path = path;
-
-  return send_(std::move(req));
-}
-
-inline Result ClientImpl::Post(const std::string &path) {
-  return Post(path, std::string(), std::string());
-}
-
-inline Result ClientImpl::Post(const std::string &path,
-                               const Headers &headers) {
-  return Post(path, headers, nullptr, 0, std::string());
-}
-
-inline Result ClientImpl::Post(const std::string &path, const char *body,
-                               size_t content_length,
-                               const std::string &content_type) {
-  return Post(path, Headers(), body, content_length, content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               const char *body, size_t content_length,
-                               const std::string &content_type) {
-  return send_with_content_provider("POST", path, headers, body, content_length,
-                                    nullptr, nullptr, content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const std::string &body,
-                               const std::string &content_type) {
-  return Post(path, Headers(), body, content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               const std::string &body,
-                               const std::string &content_type) {
-  return send_with_content_provider("POST", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Params &params) {
-  return Post(path, Headers(), params);
-}
-
-inline Result ClientImpl::Post(const std::string &path, size_t content_length,
-                               ContentProvider content_provider,
-                               const std::string &content_type) {
-  return Post(path, Headers(), content_length, std::move(content_provider),
-              content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path,
-                               ContentProviderWithoutLength content_provider,
-                               const std::string &content_type) {
-  return Post(path, Headers(), std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               size_t content_length,
-                               ContentProvider content_provider,
-                               const std::string &content_type) {
-  return send_with_content_provider("POST", path, headers, nullptr,
-                                    content_length, std::move(content_provider),
-                                    nullptr, content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               ContentProviderWithoutLength content_provider,
-                               const std::string &content_type) {
-  return send_with_content_provider("POST", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               const Params &params) {
-  auto query = detail::params_to_query_str(params);
-  return Post(path, headers, query, "application/x-www-form-urlencoded");
-}
-
-inline Result ClientImpl::Post(const std::string &path,
-                               const MultipartFormDataItems &items) {
-  return Post(path, Headers(), items);
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               const MultipartFormDataItems &items) {
-  const auto &boundary = detail::make_multipart_data_boundary();
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
-}
-
-inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
-                               const MultipartFormDataItems &items,
-                               const std::string &boundary) {
-  if (!detail::is_multipart_boundary_chars_valid(boundary)) {
-    return Result{nullptr, Error::UnsupportedMultipartBoundaryChars};
-  }
-
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
-}
-
-inline Result
-ClientImpl::Post(const std::string &path, const Headers &headers,
-                 const MultipartFormDataItems &items,
-                 const MultipartFormDataProviderItems &provider_items) {
-  const auto &boundary = detail::make_multipart_data_boundary();
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  return send_with_content_provider(
-      "POST", path, headers, nullptr, 0, nullptr,
-      get_multipart_content_provider(boundary, items, provider_items),
-      content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path) {
-  return Put(path, std::string(), std::string());
-}
-
-inline Result ClientImpl::Put(const std::string &path, const char *body,
-                              size_t content_length,
-                              const std::string &content_type) {
-  return Put(path, Headers(), body, content_length, content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              const char *body, size_t content_length,
-                              const std::string &content_type) {
-  return send_with_content_provider("PUT", path, headers, body, content_length,
-                                    nullptr, nullptr, content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const std::string &body,
-                              const std::string &content_type) {
-  return Put(path, Headers(), body, content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              const std::string &body,
-                              const std::string &content_type) {
-  return send_with_content_provider("PUT", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, size_t content_length,
-                              ContentProvider content_provider,
-                              const std::string &content_type) {
-  return Put(path, Headers(), content_length, std::move(content_provider),
-             content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path,
-                              ContentProviderWithoutLength content_provider,
-                              const std::string &content_type) {
-  return Put(path, Headers(), std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              size_t content_length,
-                              ContentProvider content_provider,
-                              const std::string &content_type) {
-  return send_with_content_provider("PUT", path, headers, nullptr,
-                                    content_length, std::move(content_provider),
-                                    nullptr, content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              ContentProviderWithoutLength content_provider,
-                              const std::string &content_type) {
-  return send_with_content_provider("PUT", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Params &params) {
-  return Put(path, Headers(), params);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              const Params &params) {
-  auto query = detail::params_to_query_str(params);
-  return Put(path, headers, query, "application/x-www-form-urlencoded");
-}
-
-inline Result ClientImpl::Put(const std::string &path,
-                              const MultipartFormDataItems &items) {
-  return Put(path, Headers(), items);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              const MultipartFormDataItems &items) {
-  const auto &boundary = detail::make_multipart_data_boundary();
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Put(path, headers, body, content_type);
-}
-
-inline Result ClientImpl::Put(const std::string &path, const Headers &headers,
-                              const MultipartFormDataItems &items,
-                              const std::string &boundary) {
-  if (!detail::is_multipart_boundary_chars_valid(boundary)) {
-    return Result{nullptr, Error::UnsupportedMultipartBoundaryChars};
-  }
-
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Put(path, headers, body, content_type);
-}
-
-inline Result
-ClientImpl::Put(const std::string &path, const Headers &headers,
-                const MultipartFormDataItems &items,
-                const MultipartFormDataProviderItems &provider_items) {
-  const auto &boundary = detail::make_multipart_data_boundary();
-  const auto &content_type =
-      detail::serialize_multipart_formdata_get_content_type(boundary);
-  return send_with_content_provider(
-      "PUT", path, headers, nullptr, 0, nullptr,
-      get_multipart_content_provider(boundary, items, provider_items),
-      content_type);
-}
-inline Result ClientImpl::Patch(const std::string &path) {
-  return Patch(path, std::string(), std::string());
-}
-
-inline Result ClientImpl::Patch(const std::string &path, const char *body,
-                                size_t content_length,
-                                const std::string &content_type) {
-  return Patch(path, Headers(), body, content_length, content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
-                                const char *body, size_t content_length,
-                                const std::string &content_type) {
-  return send_with_content_provider("PATCH", path, headers, body,
-                                    content_length, nullptr, nullptr,
-                                    content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path,
-                                const std::string &body,
-                                const std::string &content_type) {
-  return Patch(path, Headers(), body, content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
-                                const std::string &body,
-                                const std::string &content_type) {
-  return send_with_content_provider("PATCH", path, headers, body.data(),
-                                    body.size(), nullptr, nullptr,
-                                    content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path, size_t content_length,
-                                ContentProvider content_provider,
-                                const std::string &content_type) {
-  return Patch(path, Headers(), content_length, std::move(content_provider),
-               content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path,
-                                ContentProviderWithoutLength content_provider,
-                                const std::string &content_type) {
-  return Patch(path, Headers(), std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
-                                size_t content_length,
-                                ContentProvider content_provider,
-                                const std::string &content_type) {
-  return send_with_content_provider("PATCH", path, headers, nullptr,
-                                    content_length, std::move(content_provider),
-                                    nullptr, content_type);
-}
-
-inline Result ClientImpl::Patch(const std::string &path, const Headers &headers,
-                                ContentProviderWithoutLength content_provider,
-                                const std::string &content_type) {
-  return send_with_content_provider("PATCH", path, headers, nullptr, 0, nullptr,
-                                    std::move(content_provider), content_type);
-}
-
-inline Result ClientImpl::Delete(const std::string &path) {
-  return Delete(path, Headers(), std::string(), std::string());
-}
-
-inline Result ClientImpl::Delete(const std::string &path,
-                                 const Headers &headers) {
-  return Delete(path, headers, std::string(), std::string());
-}
-
-inline Result ClientImpl::Delete(const std::string &path, const char *body,
-                                 size_t content_length,
-                                 const std::string &content_type) {
-  return Delete(path, Headers(), body, content_length, content_type);
-}
-
-inline Result ClientImpl::Delete(const std::string &path,
-                                 const Headers &headers, const char *body,
-                                 size_t content_length,
-                                 const std::string &content_type) {
-  Request req;
-  req.method = "DELETE";
-  req.headers = headers;
-  req.path = path;
-
-  if (!content_type.empty()) {
-    req.headers.emplace("Content-Type", content_type);
-  }
-  req.body.assign(body, content_length);
-
-  return send_(std::move(req));
-}
-
-inline Result ClientImpl::Delete(const std::string &path,
-                                 const std::string &body,
-                                 const std::string &content_type) {
-  return Delete(path, Headers(), body.data(), body.size(), content_type);
-}
-
-inline Result ClientImpl::Delete(const std::string &path,
-                                 const Headers &headers,
-                                 const std::string &body,
-                                 const std::string &content_type) {
-  return Delete(path, headers, body.data(), body.size(), content_type);
-}
-
-inline Result ClientImpl::Options(const std::string &path) {
-  return Options(path, Headers());
-}
-
-inline Result ClientImpl::Options(const std::string &path,
-                                  const Headers &headers) {
-  Request req;
-  req.method = "OPTIONS";
-  req.headers = headers;
-  req.path = path;
-
-  return send_(std::move(req));
-}
-
-inline size_t ClientImpl::is_socket_open() const {
-  std::lock_guard<std::mutex> guard(socket_mutex_);
-  return socket_.is_open();
-}
-
-inline socket_t ClientImpl::socket() const { return socket_.sock; }
-
-inline void ClientImpl::stop() {
-  std::lock_guard<std::mutex> guard(socket_mutex_);
-
-  // If there is anything ongoing right now, the ONLY thread-safe thing we can
-  // do is to shutdown_socket, so that threads using this socket suddenly
-  // discover they can't read/write any more and error out. Everything else
-  // (closing the socket, shutting ssl down) is unsafe because these actions are
-  // not thread-safe.
-  if (socket_requests_in_flight_ > 0) {
-    shutdown_socket(socket_);
-
-    // Aside from that, we set a flag for the socket to be closed when we're
-    // done.
-    socket_should_be_closed_when_request_is_done_ = true;
-    return;
-  }
-
-  // Otherwise, still holding the mutex, we can shut everything down ourselves
-  shutdown_ssl(socket_, true);
-  shutdown_socket(socket_);
-  close_socket(socket_);
-}
-
-inline void ClientImpl::set_connection_timeout(time_t sec, time_t usec) {
-  connection_timeout_sec_ = sec;
-  connection_timeout_usec_ = usec;
-}
-
-inline void ClientImpl::set_read_timeout(time_t sec, time_t usec) {
-  read_timeout_sec_ = sec;
-  read_timeout_usec_ = usec;
-}
-
-inline void ClientImpl::set_write_timeout(time_t sec, time_t usec) {
-  write_timeout_sec_ = sec;
-  write_timeout_usec_ = usec;
-}
-
-inline void ClientImpl::set_basic_auth(const std::string &username,
-                                       const std::string &password) {
-  basic_auth_username_ = username;
-  basic_auth_password_ = password;
-}
-
-inline void ClientImpl::set_bearer_token_auth(const std::string &token) {
-  bearer_token_auth_token_ = token;
-}
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void ClientImpl::set_digest_auth(const std::string &username,
-                                        const std::string &password) {
-  digest_auth_username_ = username;
-  digest_auth_password_ = password;
-}
-#endif
-
-inline void ClientImpl::set_keep_alive(bool on) { keep_alive_ = on; }
-
-inline void ClientImpl::set_follow_location(bool on) { follow_location_ = on; }
-
-inline void ClientImpl::set_url_encode(bool on) { url_encode_ = on; }
-
-inline void
-ClientImpl::set_hostname_addr_map(std::map<std::string, std::string> addr_map) {
-  addr_map_ = std::move(addr_map);
-}
-
-inline void ClientImpl::set_default_headers(Headers headers) {
-  default_headers_ = std::move(headers);
-}
-
-inline void ClientImpl::set_address_family(int family) {
-  address_family_ = family;
-}
-
-inline void ClientImpl::set_tcp_nodelay(bool on) { tcp_nodelay_ = on; }
-
-inline void ClientImpl::set_socket_options(SocketOptions socket_options) {
-  socket_options_ = std::move(socket_options);
-}
-
-inline void ClientImpl::set_compress(bool on) { compress_ = on; }
-
-inline void ClientImpl::set_decompress(bool on) { decompress_ = on; }
-
-inline void ClientImpl::set_interface(const std::string &intf) {
-  interface_ = intf;
-}
-
-inline void ClientImpl::set_proxy(const std::string &host, int port) {
-  proxy_host_ = host;
-  proxy_port_ = port;
-}
-
-inline void ClientImpl::set_proxy_basic_auth(const std::string &username,
-                                             const std::string &password) {
-  proxy_basic_auth_username_ = username;
-  proxy_basic_auth_password_ = password;
-}
-
-inline void ClientImpl::set_proxy_bearer_token_auth(const std::string &token) {
-  proxy_bearer_token_auth_token_ = token;
-}
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void ClientImpl::set_proxy_digest_auth(const std::string &username,
-                                              const std::string &password) {
-  proxy_digest_auth_username_ = username;
-  proxy_digest_auth_password_ = password;
-}
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path,
-                                         const std::string &ca_cert_dir_path) {
-  ca_cert_file_path_ = ca_cert_file_path;
-  ca_cert_dir_path_ = ca_cert_dir_path;
-}
-
-inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store && ca_cert_store != ca_cert_store_) {
-    ca_cert_store_ = ca_cert_store;
-  }
-}
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void ClientImpl::enable_server_certificate_verification(bool enabled) {
-  server_certificate_verification_ = enabled;
-}
-#endif
-
-inline void ClientImpl::set_logger(Logger logger) {
-  logger_ = std::move(logger);
-}
-
-/*
- * SSL Implementation
- */
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-namespace detail {
-
-template <typename U, typename V>
-inline SSL *ssl_new(socket_t sock, SSL_CTX *ctx, std::mutex &ctx_mutex,
-                    U SSL_connect_or_accept, V setup) {
-  SSL *ssl = nullptr;
-  {
-    std::lock_guard<std::mutex> guard(ctx_mutex);
-    ssl = SSL_new(ctx);
-  }
-
-  if (ssl) {
-    set_nonblocking(sock, true);
-    auto bio = BIO_new_socket(static_cast<int>(sock), BIO_NOCLOSE);
-    BIO_set_nbio(bio, 1);
-    SSL_set_bio(ssl, bio, bio);
-
-    if (!setup(ssl) || SSL_connect_or_accept(ssl) != 1) {
-      SSL_shutdown(ssl);
-      {
-        std::lock_guard<std::mutex> guard(ctx_mutex);
-        SSL_free(ssl);
-      }
-      set_nonblocking(sock, false);
-      return nullptr;
-    }
-    BIO_set_nbio(bio, 0);
-    set_nonblocking(sock, false);
-  }
-
-  return ssl;
-}
-
-inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl,
-                       bool shutdown_gracefully) {
-  // sometimes we may want to skip this to try to avoid SIGPIPE if we know
-  // the remote has closed the network connection
-  // Note that it is not always possible to avoid SIGPIPE, this is merely a
-  // best-efforts.
-  if (shutdown_gracefully) { SSL_shutdown(ssl); }
-
-  std::lock_guard<std::mutex> guard(ctx_mutex);
-  SSL_free(ssl);
-}
-
-template <typename U>
-bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl,
-                                       U ssl_connect_or_accept,
-                                       time_t timeout_sec,
-                                       time_t timeout_usec) {
-  int res = 0;
-  while ((res = ssl_connect_or_accept(ssl)) != 1) {
-    auto err = SSL_get_error(ssl, res);
-    switch (err) {
-    case SSL_ERROR_WANT_READ:
-      if (select_read(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    case SSL_ERROR_WANT_WRITE:
-      if (select_write(sock, timeout_sec, timeout_usec) > 0) { continue; }
-      break;
-    default: break;
-    }
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-inline bool process_server_socket_ssl(
-    const std::atomic<socket_t> &svr_sock, SSL *ssl, socket_t sock,
-    size_t keep_alive_max_count, time_t keep_alive_timeout_sec,
-    time_t read_timeout_sec, time_t read_timeout_usec, time_t write_timeout_sec,
-    time_t write_timeout_usec, T callback) {
-  return process_server_socket_core(
-      svr_sock, sock, keep_alive_max_count, keep_alive_timeout_sec,
-      [&](bool close_connection, bool &connection_closed) {
-        SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                             write_timeout_sec, write_timeout_usec);
-        return callback(strm, close_connection, connection_closed);
-      });
-}
-
-template <typename T>
-inline bool
-process_client_socket_ssl(SSL *ssl, socket_t sock, time_t read_timeout_sec,
-                          time_t read_timeout_usec, time_t write_timeout_sec,
-                          time_t write_timeout_usec, T callback) {
-  SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                       write_timeout_sec, write_timeout_usec);
-  return callback(strm);
-}
-
-class SSLInit {
-public:
-  SSLInit() {
-    OPENSSL_init_ssl(
-        OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
-  }
-};
-
-// SSL socket stream implementation
-inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL *ssl,
-                                        time_t read_timeout_sec,
-                                        time_t read_timeout_usec,
-                                        time_t write_timeout_sec,
-                                        time_t write_timeout_usec)
-    : sock_(sock), ssl_(ssl), read_timeout_sec_(read_timeout_sec),
-      read_timeout_usec_(read_timeout_usec),
-      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec) {
-  SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
-}
-
-inline SSLSocketStream::~SSLSocketStream() {}
-
-inline bool SSLSocketStream::is_readable() const {
-  return detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
-}
-
-inline bool SSLSocketStream::is_writable() const {
-  return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_);
-}
-
-inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
-  if (SSL_pending(ssl_) > 0) {
-    return SSL_read(ssl_, ptr, static_cast<int>(size));
-  } else if (is_readable()) {
-    auto ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      int n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_READ ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_READ) {
-#endif
-        if (SSL_pending(ssl_) > 0) {
-          return SSL_read(ssl_, ptr, static_cast<int>(size));
-        } else if (is_readable()) {
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
-          ret = SSL_read(ssl_, ptr, static_cast<int>(size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  }
-  return -1;
-}
-
-inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
-  if (is_writable()) {
-    auto handle_size = static_cast<int>(
-        std::min<size_t>(size, (std::numeric_limits<int>::max)()));
-
-    auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-    if (ret < 0) {
-      auto err = SSL_get_error(ssl_, ret);
-      int n = 1000;
-#ifdef _WIN32
-      while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE ||
-                          (err == SSL_ERROR_SYSCALL &&
-                           WSAGetLastError() == WSAETIMEDOUT))) {
-#else
-      while (--n >= 0 && err == SSL_ERROR_WANT_WRITE) {
-#endif
-        if (is_writable()) {
-          std::this_thread::sleep_for(std::chrono::milliseconds(1));
-          ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size));
-          if (ret >= 0) { return ret; }
-          err = SSL_get_error(ssl_, ret);
-        } else {
-          return -1;
-        }
-      }
-    }
-    return ret;
-  }
-  return -1;
-}
-
-inline void SSLSocketStream::get_remote_ip_and_port(std::string &ip,
-                                                    int &port) const {
-  detail::get_remote_ip_and_port(sock_, ip, port);
-}
-
-inline void SSLSocketStream::get_local_ip_and_port(std::string &ip,
-                                                   int &port) const {
-  detail::get_local_ip_and_port(sock_, ip, port);
-}
-
-inline socket_t SSLSocketStream::socket() const { return sock_; }
-
-static SSLInit sslinit_;
-
-} // namespace detail
-
-// SSL HTTP server implementation
-inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path,
-                            const char *client_ca_cert_file_path,
-                            const char *client_ca_cert_dir_path,
-                            const char *private_key_password) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION);
-
-    // add default password callback before opening encrypted private key
-    if (private_key_password != nullptr && (private_key_password[0] != '\0')) {
-      SSL_CTX_set_default_passwd_cb_userdata(ctx_,
-                                             (char *)private_key_password);
-    }
-
-    if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) !=
-            1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_file_path || client_ca_cert_dir_path) {
-      SSL_CTX_load_verify_locations(ctx_, client_ca_cert_file_path,
-                                    client_ca_cert_dir_path);
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-inline SSLServer::SSLServer(X509 *cert, EVP_PKEY *private_key,
-                            X509_STORE *client_ca_cert_store) {
-  ctx_ = SSL_CTX_new(TLS_server_method());
-
-  if (ctx_) {
-    SSL_CTX_set_options(ctx_,
-                        SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
-
-    SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION);
-
-    if (SSL_CTX_use_certificate(ctx_, cert) != 1 ||
-        SSL_CTX_use_PrivateKey(ctx_, private_key) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    } else if (client_ca_cert_store) {
-      SSL_CTX_set_cert_store(ctx_, client_ca_cert_store);
-
-      SSL_CTX_set_verify(
-          ctx_, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, nullptr);
-    }
-  }
-}
-
-inline SSLServer::SSLServer(
-    const std::function<bool(SSL_CTX &ssl_ctx)> &setup_ssl_ctx_callback) {
-  ctx_ = SSL_CTX_new(TLS_method());
-  if (ctx_) {
-    if (!setup_ssl_ctx_callback(*ctx_)) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-inline SSLServer::~SSLServer() {
-  if (ctx_) { SSL_CTX_free(ctx_); }
-}
-
-inline bool SSLServer::is_valid() const { return ctx_; }
-
-inline SSL_CTX *SSLServer::ssl_context() const { return ctx_; }
-
-inline bool SSLServer::process_and_close_socket(socket_t sock) {
-  auto ssl = detail::ssl_new(
-      sock, ctx_, ctx_mutex_,
-      [&](SSL *ssl2) {
-        return detail::ssl_connect_or_accept_nonblocking(
-            sock, ssl2, SSL_accept, read_timeout_sec_, read_timeout_usec_);
-      },
-      [](SSL * /*ssl2*/) { return true; });
-
-  auto ret = false;
-  if (ssl) {
-    ret = detail::process_server_socket_ssl(
-        svr_sock_, ssl, sock, keep_alive_max_count_, keep_alive_timeout_sec_,
-        read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-        write_timeout_usec_,
-        [this, ssl](Stream &strm, bool close_connection,
-                    bool &connection_closed) {
-          return process_request(strm, close_connection, connection_closed,
-                                 [&](Request &req) { req.ssl = ssl; });
-        });
-
-    // Shutdown gracefully if the result seemed successful, non-gracefully if
-    // the connection appeared to be closed.
-    const bool shutdown_gracefully = ret;
-    detail::ssl_delete(ctx_mutex_, ssl, shutdown_gracefully);
-  }
-
-  detail::shutdown_socket(sock);
-  detail::close_socket(sock);
-  return ret;
-}
-
-// SSL HTTP client implementation
-inline SSLClient::SSLClient(const std::string &host)
-    : SSLClient(host, 443, std::string(), std::string()) {}
-
-inline SSLClient::SSLClient(const std::string &host, int port)
-    : SSLClient(host, port, std::string(), std::string()) {}
-
-inline SSLClient::SSLClient(const std::string &host, int port,
-                            const std::string &client_cert_path,
-                            const std::string &client_key_path)
-    : ClientImpl(host, port, client_cert_path, client_key_path) {
-  ctx_ = SSL_CTX_new(TLS_client_method());
-
-  detail::split(&host_[0], &host_[host_.size()], '.',
-                [&](const char *b, const char *e) {
-                  host_components_.emplace_back(std::string(b, e));
-                });
-
-  if (!client_cert_path.empty() && !client_key_path.empty()) {
-    if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(),
-                                     SSL_FILETYPE_PEM) != 1 ||
-        SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(),
-                                    SSL_FILETYPE_PEM) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-inline SSLClient::SSLClient(const std::string &host, int port,
-                            X509 *client_cert, EVP_PKEY *client_key)
-    : ClientImpl(host, port) {
-  ctx_ = SSL_CTX_new(TLS_client_method());
-
-  detail::split(&host_[0], &host_[host_.size()], '.',
-                [&](const char *b, const char *e) {
-                  host_components_.emplace_back(std::string(b, e));
-                });
-
-  if (client_cert != nullptr && client_key != nullptr) {
-    if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 ||
-        SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) {
-      SSL_CTX_free(ctx_);
-      ctx_ = nullptr;
-    }
-  }
-}
-
-inline SSLClient::~SSLClient() {
-  if (ctx_) { SSL_CTX_free(ctx_); }
-  // Make sure to shut down SSL since shutdown_ssl will resolve to the
-  // base function rather than the derived function once we get to the
-  // base class destructor, and won't free the SSL (causing a leak).
-  shutdown_ssl_impl(socket_, true);
-}
-
-inline bool SSLClient::is_valid() const { return ctx_; }
-
-inline void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (ca_cert_store) {
-    if (ctx_) {
-      if (SSL_CTX_get_cert_store(ctx_) != ca_cert_store) {
-        // Free memory allocated for old cert and use new store `ca_cert_store`
-        SSL_CTX_set_cert_store(ctx_, ca_cert_store);
-      }
-    } else {
-      X509_STORE_free(ca_cert_store);
-    }
-  }
-}
-
-inline long SSLClient::get_openssl_verify_result() const {
-  return verify_result_;
-}
-
-inline SSL_CTX *SSLClient::ssl_context() const { return ctx_; }
-
-inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
-  return is_valid() && ClientImpl::create_and_connect_socket(socket, error);
-}
-
-// Assumes that socket_mutex_ is locked and that there are no requests in flight
-inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
-                                          bool &success, Error &error) {
-  success = true;
-  Response res2;
-  if (!detail::process_client_socket(
-          socket.sock, read_timeout_sec_, read_timeout_usec_,
-          write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
-            Request req2;
-            req2.method = "CONNECT";
-            req2.path = host_and_port_;
-            return process_request(strm, req2, res2, false, error);
-          })) {
-    // Thread-safe to close everything because we are assuming there are no
-    // requests in flight
-    shutdown_ssl(socket, true);
-    shutdown_socket(socket);
-    close_socket(socket);
-    success = false;
-    return false;
-  }
-
-  if (res2.status == 407) {
-    if (!proxy_digest_auth_username_.empty() &&
-        !proxy_digest_auth_password_.empty()) {
-      std::map<std::string, std::string> auth;
-      if (detail::parse_www_authenticate(res2, auth, true)) {
-        Response res3;
-        if (!detail::process_client_socket(
-                socket.sock, read_timeout_sec_, read_timeout_usec_,
-                write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
-                  Request req3;
-                  req3.method = "CONNECT";
-                  req3.path = host_and_port_;
-                  req3.headers.insert(detail::make_digest_authentication_header(
-                      req3, auth, 1, detail::random_string(10),
-                      proxy_digest_auth_username_, proxy_digest_auth_password_,
-                      true));
-                  return process_request(strm, req3, res3, false, error);
-                })) {
-          // Thread-safe to close everything because we are assuming there are
-          // no requests in flight
-          shutdown_ssl(socket, true);
-          shutdown_socket(socket);
-          close_socket(socket);
-          success = false;
-          return false;
-        }
-      }
-    } else {
-      res = res2;
-      return false;
-    }
-  }
-
-  return true;
-}
-
-inline bool SSLClient::load_certs() {
-  bool ret = true;
-
-  std::call_once(initialize_cert_, [&]() {
-    std::lock_guard<std::mutex> guard(ctx_mutex_);
-    if (!ca_cert_file_path_.empty()) {
-      if (!SSL_CTX_load_verify_locations(ctx_, ca_cert_file_path_.c_str(),
-                                         nullptr)) {
-        ret = false;
-      }
-    } else if (!ca_cert_dir_path_.empty()) {
-      if (!SSL_CTX_load_verify_locations(ctx_, nullptr,
-                                         ca_cert_dir_path_.c_str())) {
-        ret = false;
-      }
-    } else {
-      auto loaded = false;
-#ifdef _WIN32
-      loaded =
-          detail::load_system_certs_on_windows(SSL_CTX_get_cert_store(ctx_));
-#elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__)
-#if TARGET_OS_OSX
-      loaded = detail::load_system_certs_on_macos(SSL_CTX_get_cert_store(ctx_));
-#endif // TARGET_OS_OSX
-#endif // _WIN32
-      if (!loaded) { SSL_CTX_set_default_verify_paths(ctx_); }
-    }
-  });
-
-  return ret;
-}
-
-inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) {
-  auto ssl = detail::ssl_new(
-      socket.sock, ctx_, ctx_mutex_,
-      [&](SSL *ssl2) {
-        if (server_certificate_verification_) {
-          if (!load_certs()) {
-            error = Error::SSLLoadingCerts;
-            return false;
-          }
-          SSL_set_verify(ssl2, SSL_VERIFY_NONE, nullptr);
-        }
-
-        if (!detail::ssl_connect_or_accept_nonblocking(
-                socket.sock, ssl2, SSL_connect, connection_timeout_sec_,
-                connection_timeout_usec_)) {
-          error = Error::SSLConnection;
-          return false;
-        }
-
-        if (server_certificate_verification_) {
-          verify_result_ = SSL_get_verify_result(ssl2);
-
-          if (verify_result_ != X509_V_OK) {
-            error = Error::SSLServerVerification;
-            return false;
-          }
-
-          auto server_cert = SSL_get1_peer_certificate(ssl2);
-
-          if (server_cert == nullptr) {
-            error = Error::SSLServerVerification;
-            return false;
-          }
-
-          if (!verify_host(server_cert)) {
-            X509_free(server_cert);
-            error = Error::SSLServerVerification;
-            return false;
-          }
-          X509_free(server_cert);
-        }
-
-        return true;
-      },
-      [&](SSL *ssl2) {
-        SSL_set_tlsext_host_name(ssl2, host_.c_str());
-        return true;
-      });
-
-  if (ssl) {
-    socket.ssl = ssl;
-    return true;
-  }
-
-  shutdown_socket(socket);
-  close_socket(socket);
-  return false;
-}
-
-inline void SSLClient::shutdown_ssl(Socket &socket, bool shutdown_gracefully) {
-  shutdown_ssl_impl(socket, shutdown_gracefully);
-}
-
-inline void SSLClient::shutdown_ssl_impl(Socket &socket,
-                                         bool shutdown_gracefully) {
-  if (socket.sock == INVALID_SOCKET) {
-    assert(socket.ssl == nullptr);
-    return;
-  }
-  if (socket.ssl) {
-    detail::ssl_delete(ctx_mutex_, socket.ssl, shutdown_gracefully);
-    socket.ssl = nullptr;
-  }
-  assert(socket.ssl == nullptr);
-}
-
-inline bool
-SSLClient::process_socket(const Socket &socket,
-                          std::function<bool(Stream &strm)> callback) {
-  assert(socket.ssl);
-  return detail::process_client_socket_ssl(
-      socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_,
-      write_timeout_sec_, write_timeout_usec_, std::move(callback));
-}
-
-inline bool SSLClient::is_ssl() const { return true; }
-
-inline bool SSLClient::verify_host(X509 *server_cert) const {
-  /* Quote from RFC2818 section 3.1 "Server Identity"
-
-     If a subjectAltName extension of type dNSName is present, that MUST
-     be used as the identity. Otherwise, the (most specific) Common Name
-     field in the Subject field of the certificate MUST be used. Although
-     the use of the Common Name is existing practice, it is deprecated and
-     Certification Authorities are encouraged to use the dNSName instead.
-
-     Matching is performed using the matching rules specified by
-     [RFC2459].  If more than one identity of a given type is present in
-     the certificate (e.g., more than one dNSName name, a match in any one
-     of the set is considered acceptable.) Names may contain the wildcard
-     character * which is considered to match any single domain name
-     component or component fragment. E.g., *.a.com matches foo.a.com but
-     not bar.foo.a.com. f*.com matches foo.com but not bar.com.
-
-     In some cases, the URI is specified as an IP address rather than a
-     hostname. In this case, the iPAddress subjectAltName must be present
-     in the certificate and must exactly match the IP in the URI.
-
-  */
-  return verify_host_with_subject_alt_name(server_cert) ||
-         verify_host_with_common_name(server_cert);
-}
-
-inline bool
-SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
-  auto ret = false;
-
-  auto type = GEN_DNS;
-
-  struct in6_addr addr6;
-  struct in_addr addr;
-  size_t addr_len = 0;
-
-#ifndef __MINGW32__
-  if (inet_pton(AF_INET6, host_.c_str(), &addr6)) {
-    type = GEN_IPADD;
-    addr_len = sizeof(struct in6_addr);
-  } else if (inet_pton(AF_INET, host_.c_str(), &addr)) {
-    type = GEN_IPADD;
-    addr_len = sizeof(struct in_addr);
-  }
-#endif
-
-  auto alt_names = static_cast<const struct stack_st_GENERAL_NAME *>(
-      X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr));
-
-  if (alt_names) {
-    auto dsn_matched = false;
-    auto ip_matched = false;
-
-    auto count = sk_GENERAL_NAME_num(alt_names);
-
-    for (decltype(count) i = 0; i < count && !dsn_matched; i++) {
-      auto val = sk_GENERAL_NAME_value(alt_names, i);
-      if (val->type == type) {
-        auto name = (const char *)ASN1_STRING_get0_data(val->d.ia5);
-        auto name_len = (size_t)ASN1_STRING_length(val->d.ia5);
-
-        switch (type) {
-        case GEN_DNS: dsn_matched = check_host_name(name, name_len); break;
-
-        case GEN_IPADD:
-          if (!memcmp(&addr6, name, addr_len) ||
-              !memcmp(&addr, name, addr_len)) {
-            ip_matched = true;
-          }
-          break;
-        }
-      }
-    }
-
-    if (dsn_matched || ip_matched) { ret = true; }
-  }
-
-  GENERAL_NAMES_free((STACK_OF(GENERAL_NAME) *)alt_names);
-  return ret;
-}
-
-inline bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
-  const auto subject_name = X509_get_subject_name(server_cert);
-
-  if (subject_name != nullptr) {
-    char name[BUFSIZ];
-    auto name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName,
-                                              name, sizeof(name));
-
-    if (name_len != -1) {
-      return check_host_name(name, static_cast<size_t>(name_len));
-    }
-  }
-
-  return false;
-}
-
-inline bool SSLClient::check_host_name(const char *pattern,
-                                       size_t pattern_len) const {
-  if (host_.size() == pattern_len && host_ == pattern) { return true; }
-
-  // Wildcard match
-  // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
-  std::vector<std::string> pattern_components;
-  detail::split(&pattern[0], &pattern[pattern_len], '.',
-                [&](const char *b, const char *e) {
-                  pattern_components.emplace_back(std::string(b, e));
-                });
-
-  if (host_components_.size() != pattern_components.size()) { return false; }
-
-  auto itr = pattern_components.begin();
-  for (const auto &h : host_components_) {
-    auto &p = *itr;
-    if (p != h && p != "*") {
-      auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' &&
-                            !p.compare(0, p.size() - 1, h));
-      if (!partial_match) { return false; }
-    }
-    ++itr;
-  }
-
-  return true;
-}
-#endif
-
-// Universal client implementation
-inline Client::Client(const std::string &scheme_host_port)
-    : Client(scheme_host_port, std::string(), std::string()) {}
-
-inline Client::Client(const std::string &scheme_host_port,
-                      const std::string &client_cert_path,
-                      const std::string &client_key_path) {
-  const static std::regex re(
-      R"((?:([a-z]+):\/\/)?(?:\[([\d:]+)\]|([^:/?#]+))(?::(\d+))?)");
-
-  std::smatch m;
-  if (std::regex_match(scheme_host_port, m, re)) {
-    auto scheme = m[1].str();
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-    if (!scheme.empty() && (scheme != "http" && scheme != "https")) {
-#else
-    if (!scheme.empty() && scheme != "http") {
-#endif
-#ifndef CPPHTTPLIB_NO_EXCEPTIONS
-      std::string msg = "'" + scheme + "' scheme is not supported.";
-      throw std::invalid_argument(msg);
-#endif
-      return;
-    }
-
-    auto is_ssl = scheme == "https";
-
-    auto host = m[2].str();
-    if (host.empty()) { host = m[3].str(); }
-
-    auto port_str = m[4].str();
-    auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
-
-    if (is_ssl) {
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-      cli_ = detail::make_unique<SSLClient>(host, port, client_cert_path,
-                                            client_key_path);
-      is_ssl_ = is_ssl;
-#endif
-    } else {
-      cli_ = detail::make_unique<ClientImpl>(host, port, client_cert_path,
-                                             client_key_path);
-    }
-  } else {
-    cli_ = detail::make_unique<ClientImpl>(scheme_host_port, 80,
-                                           client_cert_path, client_key_path);
-  }
-}
-
-inline Client::Client(const std::string &host, int port)
-    : cli_(detail::make_unique<ClientImpl>(host, port)) {}
-
-inline Client::Client(const std::string &host, int port,
-                      const std::string &client_cert_path,
-                      const std::string &client_key_path)
-    : cli_(detail::make_unique<ClientImpl>(host, port, client_cert_path,
-                                           client_key_path)) {}
-
-inline Client::~Client() {}
-
-inline bool Client::is_valid() const {
-  return cli_ != nullptr && cli_->is_valid();
-}
-
-inline Result Client::Get(const std::string &path) { return cli_->Get(path); }
-inline Result Client::Get(const std::string &path, const Headers &headers) {
-  return cli_->Get(path, headers);
-}
-inline Result Client::Get(const std::string &path, Progress progress) {
-  return cli_->Get(path, std::move(progress));
-}
-inline Result Client::Get(const std::string &path, const Headers &headers,
-                          Progress progress) {
-  return cli_->Get(path, headers, std::move(progress));
-}
-inline Result Client::Get(const std::string &path,
-                          ContentReceiver content_receiver) {
-  return cli_->Get(path, std::move(content_receiver));
-}
-inline Result Client::Get(const std::string &path, const Headers &headers,
-                          ContentReceiver content_receiver) {
-  return cli_->Get(path, headers, std::move(content_receiver));
-}
-inline Result Client::Get(const std::string &path,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, std::move(content_receiver), std::move(progress));
-}
-inline Result Client::Get(const std::string &path, const Headers &headers,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, headers, std::move(content_receiver),
-                   std::move(progress));
-}
-inline Result Client::Get(const std::string &path,
-                          ResponseHandler response_handler,
-                          ContentReceiver content_receiver) {
-  return cli_->Get(path, std::move(response_handler),
-                   std::move(content_receiver));
-}
-inline Result Client::Get(const std::string &path, const Headers &headers,
-                          ResponseHandler response_handler,
-                          ContentReceiver content_receiver) {
-  return cli_->Get(path, headers, std::move(response_handler),
-                   std::move(content_receiver));
-}
-inline Result Client::Get(const std::string &path,
-                          ResponseHandler response_handler,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, std::move(response_handler),
-                   std::move(content_receiver), std::move(progress));
-}
-inline Result Client::Get(const std::string &path, const Headers &headers,
-                          ResponseHandler response_handler,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, headers, std::move(response_handler),
-                   std::move(content_receiver), std::move(progress));
-}
-inline Result Client::Get(const std::string &path, const Params &params,
-                          const Headers &headers, Progress progress) {
-  return cli_->Get(path, params, headers, progress);
-}
-inline Result Client::Get(const std::string &path, const Params &params,
-                          const Headers &headers,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, content_receiver, progress);
-}
-inline Result Client::Get(const std::string &path, const Params &params,
-                          const Headers &headers,
-                          ResponseHandler response_handler,
-                          ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, response_handler, content_receiver,
-                   progress);
-}
-
-inline Result Client::Head(const std::string &path) { return cli_->Head(path); }
-inline Result Client::Head(const std::string &path, const Headers &headers) {
-  return cli_->Head(path, headers);
-}
-
-inline Result Client::Post(const std::string &path) { return cli_->Post(path); }
-inline Result Client::Post(const std::string &path, const Headers &headers) {
-  return cli_->Post(path, headers);
-}
-inline Result Client::Post(const std::string &path, const char *body,
-                           size_t content_length,
-                           const std::string &content_type) {
-  return cli_->Post(path, body, content_length, content_type);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           const char *body, size_t content_length,
-                           const std::string &content_type) {
-  return cli_->Post(path, headers, body, content_length, content_type);
-}
-inline Result Client::Post(const std::string &path, const std::string &body,
-                           const std::string &content_type) {
-  return cli_->Post(path, body, content_type);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           const std::string &body,
-                           const std::string &content_type) {
-  return cli_->Post(path, headers, body, content_type);
-}
-inline Result Client::Post(const std::string &path, size_t content_length,
-                           ContentProvider content_provider,
-                           const std::string &content_type) {
-  return cli_->Post(path, content_length, std::move(content_provider),
-                    content_type);
-}
-inline Result Client::Post(const std::string &path,
-                           ContentProviderWithoutLength content_provider,
-                           const std::string &content_type) {
-  return cli_->Post(path, std::move(content_provider), content_type);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           size_t content_length,
-                           ContentProvider content_provider,
-                           const std::string &content_type) {
-  return cli_->Post(path, headers, content_length, std::move(content_provider),
-                    content_type);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           ContentProviderWithoutLength content_provider,
-                           const std::string &content_type) {
-  return cli_->Post(path, headers, std::move(content_provider), content_type);
-}
-inline Result Client::Post(const std::string &path, const Params &params) {
-  return cli_->Post(path, params);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           const Params &params) {
-  return cli_->Post(path, headers, params);
-}
-inline Result Client::Post(const std::string &path,
-                           const MultipartFormDataItems &items) {
-  return cli_->Post(path, items);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           const MultipartFormDataItems &items) {
-  return cli_->Post(path, headers, items);
-}
-inline Result Client::Post(const std::string &path, const Headers &headers,
-                           const MultipartFormDataItems &items,
-                           const std::string &boundary) {
-  return cli_->Post(path, headers, items, boundary);
-}
-inline Result
-Client::Post(const std::string &path, const Headers &headers,
-             const MultipartFormDataItems &items,
-             const MultipartFormDataProviderItems &provider_items) {
-  return cli_->Post(path, headers, items, provider_items);
-}
-inline Result Client::Put(const std::string &path) { return cli_->Put(path); }
-inline Result Client::Put(const std::string &path, const char *body,
-                          size_t content_length,
-                          const std::string &content_type) {
-  return cli_->Put(path, body, content_length, content_type);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          const char *body, size_t content_length,
-                          const std::string &content_type) {
-  return cli_->Put(path, headers, body, content_length, content_type);
-}
-inline Result Client::Put(const std::string &path, const std::string &body,
-                          const std::string &content_type) {
-  return cli_->Put(path, body, content_type);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          const std::string &body,
-                          const std::string &content_type) {
-  return cli_->Put(path, headers, body, content_type);
-}
-inline Result Client::Put(const std::string &path, size_t content_length,
-                          ContentProvider content_provider,
-                          const std::string &content_type) {
-  return cli_->Put(path, content_length, std::move(content_provider),
-                   content_type);
-}
-inline Result Client::Put(const std::string &path,
-                          ContentProviderWithoutLength content_provider,
-                          const std::string &content_type) {
-  return cli_->Put(path, std::move(content_provider), content_type);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          size_t content_length,
-                          ContentProvider content_provider,
-                          const std::string &content_type) {
-  return cli_->Put(path, headers, content_length, std::move(content_provider),
-                   content_type);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          ContentProviderWithoutLength content_provider,
-                          const std::string &content_type) {
-  return cli_->Put(path, headers, std::move(content_provider), content_type);
-}
-inline Result Client::Put(const std::string &path, const Params &params) {
-  return cli_->Put(path, params);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          const Params &params) {
-  return cli_->Put(path, headers, params);
-}
-inline Result Client::Put(const std::string &path,
-                          const MultipartFormDataItems &items) {
-  return cli_->Put(path, items);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          const MultipartFormDataItems &items) {
-  return cli_->Put(path, headers, items);
-}
-inline Result Client::Put(const std::string &path, const Headers &headers,
-                          const MultipartFormDataItems &items,
-                          const std::string &boundary) {
-  return cli_->Put(path, headers, items, boundary);
-}
-inline Result
-Client::Put(const std::string &path, const Headers &headers,
-            const MultipartFormDataItems &items,
-            const MultipartFormDataProviderItems &provider_items) {
-  return cli_->Put(path, headers, items, provider_items);
-}
-inline Result Client::Patch(const std::string &path) {
-  return cli_->Patch(path);
-}
-inline Result Client::Patch(const std::string &path, const char *body,
-                            size_t content_length,
-                            const std::string &content_type) {
-  return cli_->Patch(path, body, content_length, content_type);
-}
-inline Result Client::Patch(const std::string &path, const Headers &headers,
-                            const char *body, size_t content_length,
-                            const std::string &content_type) {
-  return cli_->Patch(path, headers, body, content_length, content_type);
-}
-inline Result Client::Patch(const std::string &path, const std::string &body,
-                            const std::string &content_type) {
-  return cli_->Patch(path, body, content_type);
-}
-inline Result Client::Patch(const std::string &path, const Headers &headers,
-                            const std::string &body,
-                            const std::string &content_type) {
-  return cli_->Patch(path, headers, body, content_type);
-}
-inline Result Client::Patch(const std::string &path, size_t content_length,
-                            ContentProvider content_provider,
-                            const std::string &content_type) {
-  return cli_->Patch(path, content_length, std::move(content_provider),
-                     content_type);
-}
-inline Result Client::Patch(const std::string &path,
-                            ContentProviderWithoutLength content_provider,
-                            const std::string &content_type) {
-  return cli_->Patch(path, std::move(content_provider), content_type);
-}
-inline Result Client::Patch(const std::string &path, const Headers &headers,
-                            size_t content_length,
-                            ContentProvider content_provider,
-                            const std::string &content_type) {
-  return cli_->Patch(path, headers, content_length, std::move(content_provider),
-                     content_type);
-}
-inline Result Client::Patch(const std::string &path, const Headers &headers,
-                            ContentProviderWithoutLength content_provider,
-                            const std::string &content_type) {
-  return cli_->Patch(path, headers, std::move(content_provider), content_type);
-}
-inline Result Client::Delete(const std::string &path) {
-  return cli_->Delete(path);
-}
-inline Result Client::Delete(const std::string &path, const Headers &headers) {
-  return cli_->Delete(path, headers);
-}
-inline Result Client::Delete(const std::string &path, const char *body,
-                             size_t content_length,
-                             const std::string &content_type) {
-  return cli_->Delete(path, body, content_length, content_type);
-}
-inline Result Client::Delete(const std::string &path, const Headers &headers,
-                             const char *body, size_t content_length,
-                             const std::string &content_type) {
-  return cli_->Delete(path, headers, body, content_length, content_type);
-}
-inline Result Client::Delete(const std::string &path, const std::string &body,
-                             const std::string &content_type) {
-  return cli_->Delete(path, body, content_type);
-}
-inline Result Client::Delete(const std::string &path, const Headers &headers,
-                             const std::string &body,
-                             const std::string &content_type) {
-  return cli_->Delete(path, headers, body, content_type);
-}
-inline Result Client::Options(const std::string &path) {
-  return cli_->Options(path);
-}
-inline Result Client::Options(const std::string &path, const Headers &headers) {
-  return cli_->Options(path, headers);
-}
-
-inline bool Client::send(Request &req, Response &res, Error &error) {
-  return cli_->send(req, res, error);
-}
-
-inline Result Client::send(const Request &req) { return cli_->send(req); }
-
-inline size_t Client::is_socket_open() const { return cli_->is_socket_open(); }
-
-inline socket_t Client::socket() const { return cli_->socket(); }
-
-inline void Client::stop() { cli_->stop(); }
-
-inline void
-Client::set_hostname_addr_map(std::map<std::string, std::string> addr_map) {
-  cli_->set_hostname_addr_map(std::move(addr_map));
-}
-
-inline void Client::set_default_headers(Headers headers) {
-  cli_->set_default_headers(std::move(headers));
-}
-
-inline void Client::set_address_family(int family) {
-  cli_->set_address_family(family);
-}
-
-inline void Client::set_tcp_nodelay(bool on) { cli_->set_tcp_nodelay(on); }
-
-inline void Client::set_socket_options(SocketOptions socket_options) {
-  cli_->set_socket_options(std::move(socket_options));
-}
-
-inline void Client::set_connection_timeout(time_t sec, time_t usec) {
-  cli_->set_connection_timeout(sec, usec);
-}
-
-inline void Client::set_read_timeout(time_t sec, time_t usec) {
-  cli_->set_read_timeout(sec, usec);
-}
-
-inline void Client::set_write_timeout(time_t sec, time_t usec) {
-  cli_->set_write_timeout(sec, usec);
-}
-
-inline void Client::set_basic_auth(const std::string &username,
-                                   const std::string &password) {
-  cli_->set_basic_auth(username, password);
-}
-inline void Client::set_bearer_token_auth(const std::string &token) {
-  cli_->set_bearer_token_auth(token);
-}
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void Client::set_digest_auth(const std::string &username,
-                                    const std::string &password) {
-  cli_->set_digest_auth(username, password);
-}
-#endif
-
-inline void Client::set_keep_alive(bool on) { cli_->set_keep_alive(on); }
-inline void Client::set_follow_location(bool on) {
-  cli_->set_follow_location(on);
-}
-
-inline void Client::set_url_encode(bool on) { cli_->set_url_encode(on); }
-
-inline void Client::set_compress(bool on) { cli_->set_compress(on); }
-
-inline void Client::set_decompress(bool on) { cli_->set_decompress(on); }
-
-inline void Client::set_interface(const std::string &intf) {
-  cli_->set_interface(intf);
-}
-
-inline void Client::set_proxy(const std::string &host, int port) {
-  cli_->set_proxy(host, port);
-}
-inline void Client::set_proxy_basic_auth(const std::string &username,
-                                         const std::string &password) {
-  cli_->set_proxy_basic_auth(username, password);
-}
-inline void Client::set_proxy_bearer_token_auth(const std::string &token) {
-  cli_->set_proxy_bearer_token_auth(token);
-}
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void Client::set_proxy_digest_auth(const std::string &username,
-                                          const std::string &password) {
-  cli_->set_proxy_digest_auth(username, password);
-}
-#endif
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void Client::enable_server_certificate_verification(bool enabled) {
-  cli_->enable_server_certificate_verification(enabled);
-}
-#endif
-
-inline void Client::set_logger(Logger logger) { cli_->set_logger(logger); }
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path,
-                                     const std::string &ca_cert_dir_path) {
-  cli_->set_ca_cert_path(ca_cert_file_path, ca_cert_dir_path);
-}
-
-inline void Client::set_ca_cert_store(X509_STORE *ca_cert_store) {
-  if (is_ssl_) {
-    static_cast<SSLClient &>(*cli_).set_ca_cert_store(ca_cert_store);
-  } else {
-    cli_->set_ca_cert_store(ca_cert_store);
-  }
-}
-
-inline long Client::get_openssl_verify_result() const {
-  if (is_ssl_) {
-    return static_cast<SSLClient &>(*cli_).get_openssl_verify_result();
-  }
-  return -1; // NOTE: -1 doesn't match any of X509_V_ERR_???
-}
-
-inline SSL_CTX *Client::ssl_context() const {
-  if (is_ssl_) { return static_cast<SSLClient &>(*cli_).ssl_context(); }
-  return nullptr;
-}
-#endif
-
-// ----------------------------------------------------------------------------
-
-} // namespace httplib
-
-#if defined(_WIN32) && defined(CPPHTTPLIB_USE_POLL)
-#undef poll
-#endif
-
-#endif // CPPHTTPLIB_HTTPLIB_H
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
deleted file mode 100644
index a48631ff..00000000
--- a/llm/ext_server/server.cpp
+++ /dev/null
@@ -1,3227 +0,0 @@
-// MIT License
-
-// Copyright (c) 2023 Georgi Gerganov
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include "common.h"
-#include "llama.h"
-#include "log.h"
-#include "sampling.h"
-#include "utils.hpp"
-
-#include "../llava/clip.h"
-#include "../llava/llava.h"
-
-#include "stb_image.h"
-
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
-// increase max payload length to allow use of larger context size
-#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-#include "httplib.h"
-#include "json.hpp"
-
-#if defined(_WIN32)
-#include <windows.h>
-#include <errhandlingapi.h>
-#endif
-
-#include <algorithm>
-#include <cstddef>
-#include <thread>
-#include <chrono>
-#include <condition_variable>
-#include <atomic>
-#include <signal.h>
-
-using json = nlohmann::json;
-
-struct server_params {
-    std::string hostname = "127.0.0.1";
-    std::vector<std::string> api_keys;
-    std::string public_path = "examples/server/public";
-    int32_t port = 8080;
-    int32_t read_timeout = 600;
-    int32_t write_timeout = 600;
-    bool slots_endpoint = true;
-    bool metrics_endpoint = false;
-    int n_threads_http = -1;
-};
-
-bool server_verbose = false;
-bool server_log_json = false;
-
-enum stop_type {
-    STOP_FULL,
-    STOP_PARTIAL,
-};
-
-// TODO: can become bool if we can't find use of more states
-enum slot_state {
-    IDLE,
-    PROCESSING,
-};
-
-enum slot_command {
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
-};
-
-struct slot_params {
-    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
-    uint32_t seed      = -1; // RNG seed
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_predict = -1; // new tokens to predict
-
-    std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-};
-
-struct slot_image {
-    int32_t id;
-
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
-};
-
-struct server_slot {
-    int id;
-    int task_id = -1;
-
-    struct slot_params params;
-
-    slot_state state = IDLE;
-    slot_command command = NONE;
-
-    // used to determine the slot that has been used the longest
-    int64_t t_last_used = -1;
-
-    // generation props
-    int32_t n_ctx       = 0;  // context size per slot
-    int32_t n_past      = 0;
-    int32_t n_decoded   = 0;
-    int32_t n_remaining = -1;
-    int32_t i_batch     = -1;
-    int32_t n_predict   = -1;
-
-    int32_t n_prompt_tokens           = 0;
-    int32_t n_prompt_tokens_processed = 0;
-
-    json prompt;
-    std::string generated_text;
-    std::vector<llama_token> cache_tokens;
-    std::vector<completion_token_output> generated_token_probs;
-
-    bool embedding = false;
-    bool has_next_token = true;
-    bool truncated = false;
-    bool stopped_eos = false;
-    bool stopped_word = false;
-    bool stopped_limit = false;
-
-    std::string stopping_word;
-
-    // sampling
-    struct gpt_sampler_params sparams;
-    struct gpt_sampler * smpl = nullptr;
-    llama_token sampled;
-
-    int32_t ga_i = 0;   // group-attention state
-    int32_t ga_n = 1;   // group-attention factor
-    int32_t ga_w = 512; // group-attention width
-
-    int32_t n_past_se = 0; // self-extend
-
-    // multimodal
-    std::vector<slot_image> images;
-
-    // stats
-    size_t n_sent_text = 0; // number of sent text character
-    size_t n_sent_token_probs = 0;
-
-    int64_t t_start_process_prompt;
-    int64_t t_start_genereration;
-
-    double t_prompt_processing; // ms
-    double t_token_generation; // ms
-
-    // multitasks
-    int multitask_id = -1;
-
-    void reset() {
-        n_prompt_tokens        = 0;
-        generated_text         = "";
-        truncated              = false;
-        stopped_eos            = false;
-        stopped_word           = false;
-        stopped_limit          = false;
-        stopping_word          = "";
-        n_past                 = 0;
-        n_sent_text            = 0;
-        n_sent_token_probs     = 0;
-        ga_i                   = 0;
-        n_past_se              = 0;
-
-        generated_token_probs.clear();
-
-        for (slot_image & img : images) {
-            free(img.image_embedding);
-            if (img.img_data) {
-                clip_image_u8_free(img.img_data);
-            }
-            img.prefix_prompt = "";
-        }
-
-        images.clear();
-    }
-
-    bool has_budget(gpt_params &global_params) {
-        if (params.n_predict == -1 && global_params.n_predict == -1) {
-            return true; // limitless
-        }
-
-        n_remaining = -1;
-
-        if (params.n_predict != -1) {
-            n_remaining = params.n_predict - n_decoded;
-        } else if (global_params.n_predict != -1) {
-            n_remaining = global_params.n_predict - n_decoded;
-        }
-
-        return n_remaining > 0; // no budget
-    }
-
-    bool available() const {
-        return state == IDLE && command == NONE;
-    }
-
-    bool is_processing() const {
-        return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
-    }
-
-    void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE) {
-            return;
-        }
-        cache_tokens.push_back(token.tok);
-        generated_token_probs.push_back(token);
-    }
-
-    void release() {
-        if (state == PROCESSING)
-        {
-            t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
-            command = RELEASE;
-        }
-    }
-
-    json get_formated_timings() {
-        return json
-        {
-            {"prompt_n",               n_prompt_tokens_processed},
-            {"prompt_ms",              t_prompt_processing},
-            {"prompt_per_token_ms",    t_prompt_processing / n_prompt_tokens_processed},
-            {"prompt_per_second",      1e3 / t_prompt_processing * n_prompt_tokens_processed},
-
-            {"predicted_n",            n_decoded},
-            {"predicted_ms",           t_token_generation},
-            {"predicted_per_token_ms", t_token_generation / n_decoded},
-            {"predicted_per_second",   1e3 / t_token_generation * n_decoded},
-        };
-    }
-
-    void print_timings() const {
-       char buffer[512];
-        double t_token = t_prompt_processing / n_prompt_tokens_processed;
-        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-        snprintf(buffer, sizeof(buffer), "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
-                t_prompt_processing, n_prompt_tokens_processed,
-                t_token, n_tokens_second);
-        LOG_DEBUG(buffer, {
-            {"slot_id",                   id},
-            {"task_id",                   task_id},
-            {"t_prompt_processing",       t_prompt_processing},
-            {"n_prompt_tokens_processed", n_prompt_tokens_processed},
-            {"t_token",                   t_token},
-            {"n_tokens_second",           n_tokens_second},
-        });
-
-        t_token = t_token_generation / n_decoded;
-        n_tokens_second = 1e3 / t_token_generation * n_decoded;
-        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
-                t_token_generation, n_decoded,
-                t_token, n_tokens_second);
-        LOG_DEBUG(buffer, {
-            {"slot_id",            id},
-            {"task_id",            task_id},
-            {"t_token_generation", t_token_generation},
-            {"n_decoded",          n_decoded},
-            {"t_token",            t_token},
-            {"n_tokens_second",    n_tokens_second},
-        });
-
-        snprintf(buffer, sizeof(buffer), "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_DEBUG(buffer, {
-            {"slot_id",             id},
-            {"task_id",             task_id},
-            {"t_prompt_processing", t_prompt_processing},
-            {"t_token_generation",  t_token_generation},
-            {"t_total",             t_prompt_processing + t_token_generation},
-        });
-    }
-};
-
-struct server_metrics {
-    uint64_t n_prompt_tokens_processed_total = 0;
-    uint64_t n_tokens_predicted_total        = 0;
-
-    uint64_t n_prompt_tokens_processed = 0;
-    uint64_t t_prompt_processing       = 0;
-
-    uint64_t n_tokens_predicted       = 0;
-    uint64_t t_tokens_generation      = 0;
-
-
-    void on_prompt_eval(const server_slot &slot) {
-        n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
-        n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
-        t_prompt_processing             += slot.t_prompt_processing;
-    }
-
-    void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted       += slot.n_decoded;
-        t_tokens_generation      += slot.t_token_generation;
-    }
-
-    void reset_bucket() {
-        n_prompt_tokens_processed = 0;
-        t_prompt_processing       = 0;
-        n_tokens_predicted        = 0;
-        t_tokens_generation       = 0;
-    }
-};
-
-struct llama_server_context
-{
-    llama_model *model = nullptr;
-    float modelProgress = 0.0;
-    llama_context *ctx = nullptr;
-
-    clip_ctx *clp_ctx = nullptr;
-
-    gpt_params params;
-
-    llama_batch batch;
-
-    bool multimodal         = false;
-    bool clean_kv_cache     = true;
-    bool all_slots_are_idle = false;
-    bool add_bos_token      = true;
-
-    int32_t n_ctx;  // total context for all clients / slots
-
-    // system prompt
-    bool system_need_update = false;
-
-    std::string              system_prompt;
-    std::vector<llama_token> system_tokens;
-
-    std::string name_user;      // this should be the antiprompt
-    std::string name_assistant;
-
-    // slots / clients
-    std::vector<server_slot> slots;
-
-    llama_server_queue    queue_tasks;
-    llama_server_response queue_results;
-
-    server_metrics metrics;
-
-    ~llama_server_context()
-    {
-        if (clp_ctx)
-        {
-            LOG_DEBUG("freeing clip model", {});
-            clip_free(clp_ctx);
-            clp_ctx = nullptr;
-        }
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
-        }
-    }
-
-    bool load_model(const gpt_params &params_)
-    {
-        params = params_;
-        if (!params.mmproj.empty()) {
-            multimodal = true;
-            LOG_DEBUG("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
-                return false;
-            }
-
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
-                params.n_ctx = 2048;
-            }
-        }
-
-        auto init_result = llama_init_from_gpt_params(params);
-        model = init_result.model;
-        ctx = init_result.context;
-        if (model == nullptr)
-        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
-            return false;
-        }
-
-        if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG_WRN("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
-                llama_free(ctx);
-                llama_free_model(model);
-                return false;
-            }
-        }
-
-        n_ctx = llama_n_ctx(ctx);
-
-        add_bos_token = llama_add_bos_token(model);
-
-        return true;
-    }
-
-    void initialize() {
-        // create slots
-        all_slots_are_idle = true;
-
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
-
-        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
-        for (int i = 0; i < params.n_parallel; i++)
-        {
-            server_slot slot;
-
-            slot.id = i;
-            slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
-
-            LOG_DEBUG("new slot", {
-                {"slot_id",    slot.id},
-                {"n_ctx_slot", slot.n_ctx}
-            });
-
-            const int ga_n = params.grp_attn_n;
-            const int ga_w = params.grp_attn_w;
-
-            if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0                    && "ga_n must be positive");                       // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");             // NOLINT
-                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
-                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-
-                LOG_DEBUG("slot self-extend", {
-                    {"slot_id",   slot.id},
-                    {"ga_n",      ga_n},
-                    {"ga_w",      ga_w}
-                });
-            }
-
-            slot.ga_i = 0;
-            slot.ga_n = ga_n;
-            slot.ga_w = ga_w;
-
-            slot.reset();
-
-            slots.push_back(slot);
-        }
-
-        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-    }
-
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
-    {
-        // TODO: currently, we tokenize using special tokens by default
-        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
-        //       but it's better compared to completely ignoring ChatML and other chat templates
-        const bool TMP_FORCE_SPECIAL = true;
-
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
-        if (json_prompt.is_array())
-        {
-            bool first = true;
-            for (const auto& p : json_prompt)
-            {
-                if (p.is_string())
-                {
-                    auto s = p.template get<std::string>();
-                    std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
-                    }
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
-                        first = false;
-                    }
-                    prompt_tokens.push_back(p.template get<llama_token>());
-                }
-            }
-        }
-        else
-        {
-            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
-        }
-
-        return prompt_tokens;
-    }
-
-    server_slot* get_slot(int id) {
-        int64_t t_last = ggml_time_us();
-        server_slot *last_used = nullptr;
-
-        for (server_slot & slot : slots)
-        {
-            if (slot.id == id && slot.available())
-            {
-                return &slot;
-            }
-
-            if (slot.available() && slot.t_last_used < t_last)
-            {
-                last_used = &slot;
-                t_last = slot.t_last_used;
-            }
-        }
-
-        return last_used;
-    }
-
-    bool launch_slot_with_data(server_slot* &slot, json data) {
-        slot_params default_params;
-        gpt_sampler_params default_sparams;
-
-        slot->params.stream             = json_value(data, "stream",            false);
-        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
-        slot->params.n_predict          = json_value(data, "n_predict",         default_params.n_predict);
-        slot->sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
-        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
-        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
-        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typ_p             = json_value(data, "typ_p",             default_sparams.typ_p);
-        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
-        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
-        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
-        slot->sparams.penalty_last_n    = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat    = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq      = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present   = json_value(data, "presence_penalty",  default_sparams.penalty_present);
-        slot->sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
-        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
-        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->sparams.seed              = json_value(data, "seed",              default_params.seed);
-        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
-        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
-        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
-
-        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
-            // Might be better to reject the request with a 400 ?
-            LOG_WARNING("Max tokens to predict exceeds server configuration", {
-                {"params.n_predict", slot->params.n_predict},
-                {"slot.n_predict", slot->n_predict},
-            });
-            slot->params.n_predict = slot->n_predict;
-        }
-
-        if (data.count("input_suffix") != 0)
-        {
-            slot->params.input_suffix = data["input_suffix"];
-        }
-        else
-        {
-            slot->params.input_suffix = "";
-        }
-
-        if (data.count("prompt") != 0)
-        {
-            slot->prompt = data["prompt"];
-        }
-        else
-        {
-            slot->prompt = "";
-        }
-
-        slot->sparams.logit_bias.clear();
-
-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
-        }
-
-        const auto &logit_bias = data.find("logit_bias");
-        if (logit_bias != data.end() && logit_bias->is_array())
-        {
-            const int n_vocab = llama_n_vocab(model);
-            for (const auto &el : *logit_bias)
-            {
-                if (el.is_array() && el.size() == 2)
-                {
-                    float bias;
-                    if (el[1].is_number())
-                    {
-                        bias = el[1].get<float>();
-                    }
-                    else if (el[1].is_boolean() && !el[1].get<bool>())
-                    {
-                        bias = -INFINITY;
-                    }
-                    else
-                    {
-                        continue;
-                    }
-
-                    if (el[0].is_number_integer())
-                    {
-                        llama_token tok = el[0].get<llama_token>();
-                        if (tok >= 0 && tok < n_vocab)
-                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
-                        }
-                    }
-                    else if (el[0].is_string())
-                    {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
-                        for (auto tok : toks)
-                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
-                        }
-                    }
-                }
-            }
-        }
-
-        slot->params.antiprompt.clear();
-
-        const auto &stop = data.find("stop");
-        if (stop != data.end() && stop->is_array())
-        {
-            for (const auto &word : *stop)
-            {
-                if (!word.empty())
-                {
-                    slot->params.antiprompt.push_back(word);
-                }
-            }
-        }
-
-        const auto &samplers = data.find("samplers");
-        if (samplers != data.end() && samplers->is_array())
-        {
-            std::vector<std::string> sampler_names;
-            for (const auto &name : *samplers)
-            {
-                if (name.is_string())
-                {
-                    sampler_names.emplace_back(name);
-                }
-            }
-            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
-        }
-        else
-        {
-            slot->sparams.samplers = default_sparams.samplers;
-        }
-
-        if (multimodal)
-        {
-            const auto &images_data = data.find("image_data");
-            if (images_data != data.end() && images_data->is_array())
-            {
-                for (const auto &img : *images_data)
-                {
-                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
-
-                    slot_image img_sl;
-                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
-                    {
-                        LOG_ERROR("failed to load image", {
-                            {"slot_id",   slot->id},
-                            {"img_sl_id", img_sl.id}
-                        });
-                        return false;
-                    }
-                    LOG_VERBOSE("image loaded", {
-                        {"slot_id",   slot->id},
-                        {"img_sl_id", img_sl.id}
-                    });
-                    img_sl.request_encode_image = true;
-                    slot->images.push_back(img_sl);
-                }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
-                if (slot->images.size() > 0 && !slot->prompt.is_array())
-                {
-                    std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
-                    std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG_WRN("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG_WRN("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
-                            }
-                        }
-                    }
-                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
-                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
-                }
-            }
-        }
-
-        if (slot->smpl != nullptr)
-        {
-            gpt_sampler_free(slot->smpl);
-        }
-        slot->smpl = gpt_sampler_init(model, slot->sparams);
-        slot->command = LOAD_PROMPT;
-
-        all_slots_are_idle = false;
-
-        LOG_DEBUG("slot is processing task", {
-            {"slot_id", slot->id},
-            {"task_id", slot->task_id},
-        });
-
-        return true;
-    }
-
-    void kv_cache_clear() {
-        // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
-        clean_kv_cache = false;
-    }
-
-    void system_prompt_update() {
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
-
-            llama_batch_clear(batch);
-
-            for (int i = 0; i < (int)system_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-            }
-
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
-            {
-                const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view) != 0)
-                {
-                    LOG_WRN("%s: llama_decode() failed\n", __func__);
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i < params.n_parallel; ++i)
-            {
-                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
-            }
-        }
-
-        LOG_INF("system prompt updated\n");
-        system_need_update = false;
-    }
-
-    void system_prompt_notify() {
-        // release all slots
-        for (server_slot &slot : slots)
-        {
-            slot.release();
-        }
-
-        system_need_update = true;
-    }
-
-    static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
-                                        const stop_type type, server_slot &slot)
-    {
-        size_t stop_pos = std::string::npos;
-
-        for (const std::string &word : slot.params.antiprompt)
-        {
-            size_t pos;
-            if (type == STOP_FULL)
-            {
-                const size_t tmp = word.size() + last_token_size;
-                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
-                pos = text.find(word, from_pos);
-            }
-            else
-            {
-                pos = find_partial_stop_string(word, text);
-            }
-            if (pos != std::string::npos &&
-                (stop_pos == std::string::npos || pos < stop_pos))
-            {
-                if (type == STOP_FULL)
-                {
-                    slot.stopped_word   = true;
-                    slot.stopping_word  = word;
-                    slot.has_next_token = false;
-                }
-                stop_pos = pos;
-            }
-        }
-
-        return stop_pos;
-    }
-
-    bool process_token(completion_token_output &result, server_slot &slot) {
-        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
-        slot.sampled = result.tok;
-
-        // search stop word and delete it
-        if (!llama_token_is_eog(model, result.tok))
-            slot.generated_text += token_str;
-
-        slot.has_next_token = true;
-
-        // check if there is incomplete UTF-8 character at the end
-        bool incomplete = false;
-        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
-        {
-            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-            if ((c & 0xC0) == 0x80)
-            {
-                // continuation byte: 10xxxxxx
-                continue;
-            }
-            if ((c & 0xE0) == 0xC0)
-            {
-                // 2-byte character: 110xxxxx ...
-                incomplete = i < 2;
-            }
-            else if ((c & 0xF0) == 0xE0)
-            {
-                // 3-byte character: 1110xxxx ...
-                incomplete = i < 3;
-            }
-            else if ((c & 0xF8) == 0xF0)
-            {
-                // 4-byte character: 11110xxx ...
-                incomplete = i < 4;
-            }
-            // else 1-byte character or invalid byte
-            break;
-        }
-
-        if (!incomplete)
-        {
-            size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-
-            if (!llama_token_is_eog(model, result.tok)) {
-                const std::string str_test = slot.generated_text.substr(pos);
-                bool is_stop_full = false;
-                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-                if (stop_pos != std::string::npos)
-                {
-                    is_stop_full = true;
-                    slot.generated_text.erase(
-                        slot.generated_text.begin() + pos + stop_pos,
-                        slot.generated_text.end());
-                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
-                }
-                else
-                {
-                    is_stop_full = false;
-                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-                }
-
-                // check if there is any token to predict
-                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-                {
-                    // no send the stop word in the response
-                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                    slot.n_sent_text += result.text_to_send.size();
-                    // add the token to slot queue and cache
-                }
-            } else {
-                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                    slot.n_sent_text += result.text_to_send.size();
-            }
-
-            if (slot.params.stream)
-            {
-                send_partial_response(slot, result);
-            }
-        }
-
-        slot.add_token_string(result);
-
-        if (incomplete)
-        {
-            slot.has_next_token = true;
-        }
-
-        // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
-        {
-            slot.stopped_limit = true;
-            slot.has_next_token = false;
-        }
-
-        if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
-        {
-            slot.stopped_eos = true;
-            slot.has_next_token = false;
-            LOG_VERBOSE("eos token found", {});
-        }
-
-        LOG_VERBOSE("next token", {
-                                      {"token", result.tok},
-                                      {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
-                                      {"has_next_token", slot.has_next_token},
-                                      {"n_remain", slot.n_remaining},
-                                      {"num_tokens_predicted", slot.n_decoded},
-                                      {"stopped_eos", slot.stopped_eos},
-                                      {"stopped_word", slot.stopped_word},
-                                      {"stopped_limit", slot.stopped_limit},
-                                      {"stopping_word", slot.stopping_word},
-                                  });
-
-        return slot.has_next_token; // continue
-    }
-
-    bool process_images(server_slot &slot) const
-    {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
-                continue;
-            }
-
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_WRN("Error processing the given image");
-                return false;
-            }
-
-
-            img.request_encode_image = false;
-        }
-
-        return slot.images.size() > 0;
-    }
-
-    void send_error(task_server& task, const std::string &error)
-    {
-        LOG_WRN("task %i - error: %s\n", task.id, error.c_str());
-        task_result res;
-        res.id = task.id;
-        res.multitask_id = task.multitask_id;
-        res.stop = false;
-        res.error = true;
-        res.result_json = { { "content", error } };
-        queue_results.send(res);
-    }
-
-    json get_formated_generation(server_slot &slot)
-    {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
-        }
-
-        return json {
-            {"n_ctx",             slot.n_ctx},
-            {"n_predict",         slot.n_predict},
-            {"model",             params.model_alias},
-            {"seed",              slot.params.seed},
-            {"temperature",       slot.sparams.temp},
-            {"dynatemp_range",    slot.sparams.dynatemp_range},
-            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
-            {"top_k",             slot.sparams.top_k},
-            {"top_p",             slot.sparams.top_p},
-            {"min_p",             slot.sparams.min_p},
-            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typ_p},
-            {"repeat_last_n",     slot.sparams.penalty_last_n},
-            {"repeat_penalty",    slot.sparams.penalty_repeat},
-            {"presence_penalty",  slot.sparams.penalty_present},
-            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"mirostat",          slot.sparams.mirostat},
-            {"mirostat_tau",      slot.sparams.mirostat_tau},
-            {"mirostat_eta",      slot.sparams.mirostat_eta},
-            {"penalize_nl",       slot.sparams.penalize_nl},
-            {"stop",              slot.params.antiprompt},
-            {"n_predict",         slot.params.n_predict},
-            {"n_keep",            params.n_keep},
-            {"ignore_eos",        slot.sparams.ignore_eos},
-            {"stream",            slot.params.stream},
-            //{"logit_bias",        slot.sparams.logit_bias},
-            {"n_probs",           slot.sparams.n_probs},
-            {"min_keep",          slot.sparams.min_keep},
-            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers}
-        };
-    }
-
-    void send_partial_response(server_slot &slot, completion_token_output tkn)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = false;
-
-        res.result_json = json
-        {
-            {"stop",       false},
-            {"slot_id",    slot.id},
-            {"multimodal", multimodal}
-        };
-
-        res.result_json["content"] = tkn.text_to_send;
-
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
-            size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
-            if (probs_pos < probs_stop_pos)
-            {
-                probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
-            }
-            slot.n_sent_token_probs = probs_stop_pos;
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
-        }
-
-        queue_results.send(res);
-    }
-
-    void send_final_response(server_slot &slot)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = true;
-
-        res.result_json = json
-        {
-            {"content",             !slot.params.stream ? slot.generated_text : ""},
-            {"slot_id",             slot.id},
-            {"stop",                true},
-            {"model",               params.model_alias},
-            {"tokens_predicted",    slot.n_decoded},
-            {"tokens_evaluated",    slot.n_prompt_tokens},
-            {"truncated",           slot.truncated},
-            {"stopped_eos",         slot.stopped_eos},
-            {"stopped_word",        slot.stopped_word},
-            {"stopped_limit",       slot.stopped_limit},
-            {"stopping_word",       slot.stopping_word},
-            {"tokens_cached",       slot.n_past},
-            {"timings",             slot.get_formated_timings()}
-        };
-
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs = {};
-            if (!slot.params.stream && slot.stopped_word)
-            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
-                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
-            }
-            else
-            {
-                probs = std::vector<completion_token_output>(
-                                    slot.generated_token_probs.begin(),
-                                    slot.generated_token_probs.end());
-            }
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
-        }
-
-        queue_results.send(res);
-    }
-
-    void send_embedding(server_slot & slot, const llama_batch & batch)
-    {
-        task_result res;
-        res.id = slot.task_id;
-        res.multitask_id = slot.multitask_id;
-        res.error = false;
-        res.stop = true;
-
-        const int n_embd = llama_n_embd(model);
-
-        if (!params.embedding)
-        {
-            LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
-            res.result_json = json
-            {
-                {"embedding", std::vector<float>(n_embd, 0.0f)},
-            };
-        }
-        else
-        {
-            for (int i = 0; i < batch.n_tokens; ++i) {
-                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-                    continue;
-                }
-
-                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-                if (embd == NULL) {
-                    embd = llama_get_embeddings_ith(ctx, i);
-                    if (embd == NULL) {
-                        LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
-                        res.result_json = json
-                        {
-                            {"embedding", std::vector<float>(n_embd, 0.0f)},
-                        };
-                        continue;
-                    }
-                }
-
-                res.result_json = json
-                {
-                    {"embedding", std::vector<float>(embd, embd + n_embd)},
-                };
-            }
-        }
-        queue_results.send(res);
-    }
-
-    void request_completion(int task_id, json data, bool embedding, int multitask_id)
-    {
-        task_server task;
-        task.id = task_id;
-        task.target_id = 0;
-        task.data = std::move(data);
-        task.embedding_mode = embedding;
-        task.type = TASK_TYPE_COMPLETION;
-        task.multitask_id = multitask_id;
-
-        // when a completion task's prompt array is not a singleton, we split it into multiple requests
-        // otherwise, it's a single-prompt task, we actually queue it
-        // if there's numbers in the prompt array it will be treated as an array of tokens
-        if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
-            bool numbers = false;
-            for (const auto& e : task.data.at("prompt")) {
-                if (e.is_number()) {
-                    numbers = true;
-                    break;
-                }
-            }
-
-            // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
-            // it will completely stall the server. I don't know where the bug for this is.
-            //
-            // if there are numbers, it needs to be treated like a single prompt,
-            // queue_tasks handles a mix of strings and numbers just fine.
-            if (numbers) {
-                queue_tasks.post(task);
-            } else {
-                split_multiprompt_task(task_id, task);
-            }
-        } else {
-            // an empty prompt can make slot become buggy
-            if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get<std::string>().empty()) {
-                task.data["prompt"] = " "; // add a space so that we have one token
-            }
-            queue_tasks.post(task);
-        }
-    }
-
-    // for multiple images processing
-    bool ingest_images(server_slot &slot, int n_batch)
-    {
-        int image_idx = 0;
-
-        while (image_idx < (int) slot.images.size())
-        {
-            slot_image &img = slot.images[image_idx];
-
-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG_WRN("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
-
-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
-                }
-
-                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = {
-                    n_eval,
-                    nullptr,
-                    (img.image_embedding + i * n_embd),
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    nullptr,
-                    slot.n_past,
-                    1, 0
-                };
-                if (llama_decode(ctx, batch_img))
-                {
-                    LOG_WRN("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
-
-            llama_batch_clear(batch);
-
-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
-
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
-        }
-
-        return true;
-    }
-
-    void request_cancel(int task_id)
-    {
-        task_server task;
-        task.type = TASK_TYPE_CANCEL;
-        task.target_id = task_id;
-        queue_tasks.post(task);
-    }
-
-    void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
-    {
-        int prompt_count = multiprompt_task.data.at("prompt").size();
-        if (prompt_count <= 1) {
-            send_error(multiprompt_task, "error while handling multiple prompts");
-            return;
-        }
-
-        // generate all the ID for subtask
-        std::vector<int> subtask_ids(prompt_count);
-        for (int i = 0; i < prompt_count; i++)
-        {
-            subtask_ids[i] = queue_tasks.get_new_id();
-        }
-
-        // queue up the multitask so we can track its subtask progression
-        queue_tasks.add_multitask(multitask_id, subtask_ids);
-
-        // add subtasks
-        for (int i = 0; i < prompt_count; i++)
-        {
-            json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data["prompt"][i];
-
-            // subtasks inherit everything else (embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.embedding_mode, multitask_id);
-        }
-    }
-
-    std::string common_prefix(const std::string& str1, const std::string& str2) {
-        auto mismatch_pair = std::mismatch(str1.begin(), str1.end(), str2.begin());
-        return std::string(str1.begin(), mismatch_pair.first);
-    }
-
-    // Find the slot that has the greatest common prefix
-    server_slot *prefix_slot(const json &prompt) {
-        if (!prompt.is_string()) {
-            return nullptr;
-        }
-
-        std::string prompt_str = prompt.get<std::string>();
-        server_slot *slot = nullptr;
-        size_t longest = 0;
-
-        for (server_slot &s : slots) {
-            if (s.available() && s.prompt.is_string()) {
-                std::string s_prompt = s.prompt.get<std::string>();
-                std::string prefix = common_prefix(s_prompt, prompt_str);
-
-                if (prefix.size() > longest) {
-                    slot = &s;
-                    longest = prefix.size();
-                }
-            }
-        }
-
-        if (!slot) {
-            return get_slot(-1);
-        }
-
-        LOG_DEBUG("slot with common prefix found", {{
-            "slot_id", slot->id,
-            "characters", longest
-        }});
-        return slot;
-    }
-
-    void process_single_task(task_server& task)
-    {
-        switch (task.type)
-        {
-            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = nullptr;
-                if (task.embedding_mode) {
-                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
-                    slot = slots[0].available() ? &slots[0] : nullptr;
-                } else {
-                    slot = prefix_slot(task.data["prompt"]);
-                }
-                if (slot == nullptr)
-                {
-                    // if no slot is available, we defer this task for processing later
-                    LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
-                    queue_tasks.defer(task);
-                    break;
-                }
-
-                slot->reset();
-
-                slot->embedding    = task.embedding_mode;
-                slot->task_id      = task.id;
-                slot->multitask_id = task.multitask_id;
-
-                if (!launch_slot_with_data(slot, task.data))
-                {
-                    // send error result
-                    send_error(task, "internal_error");
-                    break;
-                }
-            } break;
-            case TASK_TYPE_CANCEL: { // release slot linked with the task id
-                for (auto & slot : slots)
-                {
-                    if (slot.task_id == task.target_id)
-                    {
-                        slot.release();
-                        break;
-                    }
-                }
-            } break;
-            case TASK_TYPE_NEXT_RESPONSE: {
-                // do nothing
-            } break;
-            case TASK_TYPE_METRICS: {
-                json slots_data        = json::array();
-                int n_idle_slots       = 0;
-                int n_processing_slots = 0;
-
-                for (server_slot &slot: slots) {
-                    json slot_data = get_formated_generation(slot);
-                    slot_data["id"] = slot.id;
-                    slot_data["task_id"] = slot.task_id;
-                    slot_data["state"] = slot.state;
-                    slot_data["prompt"] = slot.prompt;
-                    slot_data["next_token"] = {
-                            {"has_next_token",       slot.has_next_token},
-                            {"n_remain",             slot.n_remaining},
-                            {"num_tokens_predicted", slot.n_decoded},
-                            {"stopped_eos",          slot.stopped_eos},
-                            {"stopped_word",         slot.stopped_word},
-                            {"stopped_limit",        slot.stopped_limit},
-                            {"stopping_word",        slot.stopping_word},
-                    };
-                    if (slot_data["state"] == IDLE) {
-                        n_idle_slots++;
-                    } else {
-                        n_processing_slots++;
-                    }
-                    slots_data.push_back(slot_data);
-                }
-                LOG_DEBUG("slot data", {
-                    {"task_id",            task.id},
-                    {"n_idle_slots",       n_idle_slots},
-                    {"n_processing_slots", n_processing_slots}
-                });
-                LOG_VERBOSE("slot data", {
-                    {"task_id",            task.id},
-                    {"n_idle_slots",       n_idle_slots},
-                    {"n_processing_slots", n_processing_slots},
-                    {"slots",              slots_data}
-                });
-                task_result res;
-                res.id = task.id;
-                res.multitask_id = task.multitask_id;
-                res.stop = true;
-                res.error = false;
-                res.result_json = {
-                        { "idle",                            n_idle_slots       },
-                        { "processing",                      n_processing_slots },
-                        { "deferred",                        queue_tasks.queue_tasks_deferred.size() },
-
-                        { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
-                        { "n_tokens_predicted_total",        metrics.n_tokens_predicted_total},
-
-                        { "n_prompt_tokens_processed",       metrics.n_prompt_tokens_processed},
-                        { "t_prompt_processing",             metrics.t_prompt_processing},
-                        { "n_tokens_predicted",              metrics.n_tokens_predicted},
-                        { "t_tokens_generation",             metrics.t_tokens_generation},
-
-                        { "kv_cache_tokens_count",           llama_get_kv_cache_token_count(ctx)},
-                        { "kv_cache_used_cells",             llama_get_kv_cache_used_cells(ctx)},
-
-                        { "slots",                           slots_data },
-                };
-                metrics.reset_bucket();
-                queue_results.send(res);
-            } break;
-        }
-    }
-
-    void on_finish_multitask(task_multi& multitask)
-    {
-        // all subtasks done == multitask is done
-        task_result result;
-        result.id = multitask.id;
-        result.stop = true;
-        result.error = false;
-
-        // collect json results into one json result
-        std::vector<json> result_jsons;
-        for (auto& subres : multitask.results)
-        {
-            result_jsons.push_back(subres.result_json);
-            result.error = result.error && subres.error;
-        }
-        result.result_json = json{ { "results", result_jsons } };
-        queue_results.send(result);
-    }
-
-    bool update_slots() {
-        if (system_need_update)
-        {
-            LOG_DEBUG("updating system prompt", {});
-            system_prompt_update();
-        }
-
-        llama_batch_clear(batch);
-
-        if (all_slots_are_idle)
-        {
-            if (system_prompt.empty() && clean_kv_cache)
-            {
-                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
-                kv_cache_clear();
-            }
-            return true;
-        }
-
-        LOG_VERBOSE("posting NEXT_RESPONSE", {});
-        task_server task;
-        task.type = TASK_TYPE_NEXT_RESPONSE;
-        task.target_id = -1;
-        queue_tasks.post(task);
-
-        for (server_slot &slot : slots)
-        {
-            if (slot.ga_n == 1)
-            {
-                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
-                {
-                    // Shift context
-                    const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
-                    const int n_discard = n_left / 2;
-
-                    LOG_DEBUG("slot context shift", {
-                        {"slot_id",         slot.id},
-                        {"task_id",         slot.task_id},
-                        {"n_keep",          n_keep},
-                        {"n_left",          n_left},
-                        {"n_discard",       n_discard},
-                        {"n_ctx",           n_ctx},
-                        {"n_past",          slot.n_past},
-                        {"n_system_tokens", system_tokens.size()},
-                        {"n_cache_tokens",  slot.cache_tokens.size()}
-                    });
-                    llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
-                    {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                    }
-
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
-                }
-            }
-        }
-
-        // decode any currently ongoing sequences
-        LOG_VERBOSE("decoding ongoing sequences", {});
-        for (auto & slot : slots)
-        {
-            // release the slot
-            if (slot.command == RELEASE)
-            {
-                slot.state = IDLE;
-                slot.command = NONE;
-                slot.t_last_used = ggml_time_us();
-
-                LOG_DEBUG("slot released", {
-                    {"slot_id",         slot.id},
-                    {"task_id",         slot.task_id},
-                    {"n_ctx",           n_ctx},
-                    {"n_past",          slot.n_past},
-                    {"n_system_tokens", system_tokens.size()},
-                    {"n_cache_tokens",  slot.cache_tokens.size()},
-                    {"truncated",       slot.truncated}
-                });
-                queue_tasks.notify_slot_changed();
-
-                continue;
-            }
-
-            if (slot.state == IDLE)
-            {
-                continue;
-            }
-
-            slot.i_batch = batch.n_tokens;
-
-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
-            slot.n_past += 1;
-        }
-
-        // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
-
-        // assign workload to the slots
-        if (params.cont_batching || batch.n_tokens == 0)
-        {
-            for (auto & slot : slots)
-            {
-                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
-
-                // empty prompt passed -> release the slot and send empty response
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
-                {
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    continue;
-                }
-
-                // need process the prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
-                {
-                    slot.state = PROCESSING;
-                    slot.command = NONE;
-                    std::vector<llama_token> prompt_tokens;
-                    slot.t_start_process_prompt = ggml_time_us();
-                    slot.t_start_genereration = 0;
-
-                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty());  // add BOS if there isn't system prompt
-
-                    slot.n_prompt_tokens = prompt_tokens.size();
-
-                    if (slot.params.n_keep < 0)
-                    {
-                        slot.params.n_keep = slot.n_prompt_tokens;
-                    }
-                    slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-
-                    // if input prompt is too big, truncate it, if group attention self-extend is disabled
-                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
-                    {
-                        const int n_left = slot.n_ctx - slot.params.n_keep;
-                        const int n_shift = n_left / 2;
-                        const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
-
-                        std::vector<llama_token> new_tokens(
-                            prompt_tokens.begin(),
-                            prompt_tokens.begin() + slot.params.n_keep);
-                        new_tokens.insert(
-                            new_tokens.end(),
-                            prompt_tokens.begin() + slot.params.n_keep + n_erase,
-                            prompt_tokens.end());
-
-                        LOG_INFO("input truncated", {
-                            {"n_ctx",        slot.n_ctx},
-                            {"n_keep",       slot.params.n_keep},
-                            {"n_left",       n_left},
-                            {"n_shift",      n_shift},
-                            {"n_erase",      n_erase},
-                        });
-                        slot.truncated = true;
-                        prompt_tokens = new_tokens;
-
-                        slot.n_prompt_tokens = prompt_tokens.size();
-                        GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
-                    }
-
-                    if (!slot.params.cache_prompt)
-                    {
-                        gpt_sampler_reset(slot.smpl);
-
-                        slot.n_past    = 0;
-                        slot.n_past_se = 0;
-                        slot.ga_i      = 0;
-                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
-                    }
-                    else
-                    {
-                        // push the prompt into the sampling context (do not apply grammar)
-                        for (auto &token : prompt_tokens)
-                        {
-                            gpt_sampler_accept(slot.smpl, token, false);
-                        }
-
-                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-
-                        // the last token of the cache is not in the KV cache until the next call to llama_decode
-                        // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
-                        if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
-                        {
-                            slot.n_past -= 1;
-                        }
-
-                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
-
-                        if (slot.ga_n != 1)
-                        {
-                            int ga_i = 0;
-                            int32_t ga_n = slot.ga_n;
-                            int32_t ga_w = slot.ga_w;
-                            int32_t slot_npast = 0;
-                            for (int k = 0; k < slot.n_past; ++k)
-                            {
-                                while (slot_npast >= ga_i + ga_w) {
-                                    const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                    slot_npast -= bd;
-                                    ga_i += ga_w/ga_n;
-                                }
-                                slot_npast++;
-                            }
-                            slot.n_past_se = slot_npast;
-                            slot.ga_i = ga_i;
-                        }
-
-                        LOG_DEBUG("slot progression", {
-                            { "slot_id",    slot.id },
-                            { "task_id",    slot.task_id },
-                            { "n_past",     slot.n_past },
-                            { "n_past_se",  slot.n_past_se },
-                            { "ga_i",       slot.ga_i },
-                            { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
-                        });
-                    }
-
-                    slot.cache_tokens = prompt_tokens;
-
-                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
-                    {
-                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
-                            { "slot_id", slot.id },
-                            { "task_id", slot.task_id }
-                        });
-                        slot.n_past--;
-                        if (slot.ga_i > 0)
-                        {
-                            slot.n_past_se--;
-                        }
-                    }
-
-                    int p0 = (int) system_tokens.size() + slot.n_past;
-                    LOG_DEBUG("kv cache rm [p0, end)", {
-                        { "slot_id", slot.id },
-                        { "task_id", slot.task_id },
-                        { "p0",      p0 }
-                    });
-                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
-
-                    LOG_VERBOSE("prompt ingested", {
-                                                    {"n_past",  slot.n_past},
-                                                    {"cached",  tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
-                                                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
-                                                });
-
-                    const bool has_images = process_images(slot);
-
-                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
-
-                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-                    int32_t ga_i = slot.ga_i;
-                    int32_t ga_n = slot.ga_n;
-                    int32_t ga_w = slot.ga_w;
-
-                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
-                    {
-                        if (slot.ga_n != 1)
-                        {
-                            while (slot_npast >= ga_i + ga_w) {
-                                const int bd = (ga_w/ga_n)*(ga_n - 1);
-                                slot_npast -= bd;
-                                ga_i += ga_w/ga_n;
-                            }
-                        }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
-                        slot_npast++;
-                    }
-
-                    if (has_images && !ingest_images(slot, n_batch))
-                    {
-                        LOG_ERROR("failed processing images", {
-                            {"slot_id", slot.id},
-                            {"task_id", slot.task_id},
-                        });
-                        // FIXME @phymbert: to be properly tested
-                        //  early returning without changing the slot state will block the slot for ever
-                        // no one at the moment is checking the return value
-                        return false;
-                    }
-
-                    // extract the logits only for the last token
-                    if (batch.n_tokens > 0)
-                    {
-                        batch.logits[batch.n_tokens - 1] = true;
-                    }
-
-                    slot.n_decoded = 0;
-                    slot.i_batch   = batch.n_tokens - 1;
-                }
-            }
-        }
-
-        if (batch.n_tokens == 0)
-        {
-            all_slots_are_idle = true;
-            return true;
-        }
-
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-        {
-            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
-
-            for (auto & slot : slots)
-            {
-                if (slot.ga_n != 1)
-                {
-                    // context extension via Self-Extend
-                    while (slot.n_past_se >= slot.ga_i + slot.ga_w)
-                    {
-                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
-                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
-                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-
-                        LOG_DBG("\n");
-                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
-
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
-
-                        slot.n_past_se -= bd;
-
-                        slot.ga_i += slot.ga_w / slot.ga_n;
-
-                        LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
-                    }
-                    slot.n_past_se += n_tokens;
-                }
-            }
-
-            llama_batch batch_view =
-            {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-
-            if (ret != 0)
-            {
-                if (n_batch == 1 || ret < 0)
-                {
-                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_WRN("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
-                    return false;
-                }
-
-                LOG_WRN("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
-                // retry with half the batch size to try to find a free slot in the KV cache
-                n_batch /= 2;
-                i -= n_batch;
-                continue;
-            }
-
-            for (auto & slot : slots)
-            {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
-                {
-                    continue;
-                }
-
-                // prompt evaluated for embedding
-                if (slot.embedding)
-                {
-                    send_embedding(slot, batch_view);
-                    slot.release();
-                    slot.i_batch = -1;
-                    continue;
-                }
-
-                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
-
-                gpt_sampler_accept(slot.smpl, id, true);
-
-                slot.n_decoded += 1;
-                if (slot.n_decoded == 1)
-                {
-                    slot.t_start_genereration = ggml_time_us();
-                    slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
-                    metrics.on_prompt_eval(slot);
-                }
-
-                result.tok = id;
-                const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
-
-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
-                 }
-
-                if (!process_token(result, slot))
-                {
-                    slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
-                    metrics.on_prediction(slot);
-                }
-
-                slot.i_batch = -1;
-            }
-        }
-
-        LOG_VERBOSE("slots updated", {});
-        return true;
-    }
-
-    json model_meta() {
-        return json{
-                {"vocab_type", llama_vocab_type(model)},
-                {"n_vocab", llama_n_vocab(model)},
-                {"n_ctx_train", llama_n_ctx_train(model)},
-                {"n_embd", llama_n_embd(model)},
-                {"n_params", llama_model_n_params(model)},
-                {"size", llama_model_size(model)},
-        };
-    }
-};
-
-static void server_print_usage(const char *argv0, const gpt_params &params,
-                               const server_params &sparams)
-{
-    printf("usage: %s [options]\n", argv0);
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help                show this help message and exit\n");
-    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
-    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
-    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                            RoPE frequency scaling method, defaults to linear unless specified by the model\n");
-    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N       RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-ext-factor N       YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N      YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N        YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N        YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  --pooling {none,mean,cls}\n");
-    printf("                        pooling type for embeddings, use model default if unspecified\n");
-    printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_supports_mlock())
-    {
-        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_supports_mmap())
-    {
-        printf("  --no-mmap                 do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    printf("  --numa TYPE               attempt optimizations that help on some NUMA systems\n");
-    printf("                              - distribute: spread execution evenly over all nodes\n");
-    printf("                              - isolate: only spawn threads on CPUs on the node that execution started on\n");
-    printf("                              - numactl: use the CPU map provided my numactl\n");
-    if (llama_supports_gpu_offload()) {
-        printf("  -ngl N, --n-gpu-layers N\n");
-        printf("                            number of layers to store in VRAM\n");
-        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-        printf("                            how to split the model across multiple GPUs, one of:\n");
-        printf("                              - none: use one GPU only\n");
-        printf("                              - layer (default): split layers and KV across GPUs\n");
-        printf("                              - row: split rows across GPUs\n");
-        printf("  -ts SPLIT --tensor-split SPLIT\n");
-        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
-        printf("                            or for intermediate results and KV (with split-mode = row)\n");
-    }
-    printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
-    printf("  -a ALIAS, --alias ALIAS\n");
-    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
-    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
-    printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
-    printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
-    printf("  --path PUBLIC_PATH        path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
-    printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
-    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    printf("  --embedding               enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
-    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
-    printf("  -spf FNAME, --system-prompt-file FNAME\n");
-    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                            KV cache data type for K (default: f16)\n");
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                            KV cache data type for V (default: f16)\n");
-    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
-    printf("  --log-format              log output format: json or text (default: json)\n");
-    printf("  --log-disable             disables logging to a file.\n");
-    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
-    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
-    printf("\n");
-    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
-    printf("  --chat-template JINJA_TEMPLATE\n");
-    printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
-    printf("                            Note: only commonly used templates are accepted, since we don't have jinja parser\n");
-    printf("\n");
-}
-
-static void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
-{
-    gpt_params default_params;
-    server_params default_sparams;
-    std::string arg;
-    bool invalid_param = false;
-
-    for (int i = 1; i < argc; i++)
-    {
-        arg = argv[i];
-        if (arg == "--port")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.port = std::stoi(argv[i]);
-        }
-        else if (arg == "--host")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.hostname = argv[i];
-        }
-        else if (arg == "--path")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.public_path = argv[i];
-        }
-        else if (arg == "--api-key")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.api_keys.emplace_back(argv[i]);
-        }
-        else if (arg == "--api-key-file")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream key_file(argv[i]);
-            if (!key_file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string key;
-            while (std::getline(key_file, key)) {
-               if (key.size() > 0) {
-                   sparams.api_keys.push_back(key);
-               }
-            }
-            key_file.close();
-        }
-        else if (arg == "--timeout" || arg == "-to")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.read_timeout = std::stoi(argv[i]);
-            sparams.write_timeout = std::stoi(argv[i]);
-        }
-        else if (arg == "-m" || arg == "--model")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        }
-        else if (arg == "-a" || arg == "--alias")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model_alias = argv[i];
-        }
-        else if (arg == "-h" || arg == "--help")
-        {
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(0);
-        }
-        else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        }
-        else if (arg == "--rope-scaling")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
-            else { invalid_param = true; break; }
-        }
-        else if (arg == "--rope-freq-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_base = std::stof(argv[i]);
-        }
-        else if (arg == "--rope-freq-scale")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-ext-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-attn-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-fast")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-slow")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
-        }
-        else if (arg == "--pooling")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
-            else { invalid_param = true; break; }
-        }
-        else if (arg == "--threads" || arg == "-t")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.cpuparams.n_threads = std::stoi(argv[i]);
-        }
-        else if (arg == "--grp-attn-n" || arg == "-gan")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        }
-        else if (arg == "--grp-attn-w" || arg == "-gaw")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
-        }
-        else if (arg == "--threads-batch" || arg == "-tb")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
-        }
-        else if (arg == "--threads-http")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.n_threads_http = std::stoi(argv[i]);
-        }
-        else if (arg == "-b" || arg == "--batch-size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-        }
-        else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (llama_supports_gpu_offload()) {
-                params.n_gpu_layers = std::stoi(argv[i]);
-            } else {
-                LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                        "See main README.md for information on enabling GPU BLAS support",
-                        {{"n_gpu_layers", params.n_gpu_layers}});
-            }
-        }
-        else if (arg == "--split-mode" || arg == "-sm")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-            if (arg_next == "none")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            }
-            else if (arg_next == "layer")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            }
-            else if (arg_next == "row")
-            {
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            }
-            else {
-                invalid_param = true;
-                break;
-            }
-#ifndef GGML_USE_CUDA
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
-        }
-        else if (arg == "--tensor-split" || arg == "-ts")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-            std::string arg_next = argv[i];
-
-            // split string by , and /
-            const std::regex regex{R"([,/]+)"};
-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
-            for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
-            {
-                if (i_device < split_arg.size())
-                {
-                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-                }
-                else
-                {
-                    params.tensor_split[i_device] = 0.0f;
-                }
-            }
-#else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
-        }
-        else if (arg == "--main-gpu" || arg == "-mg")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
-            params.main_gpu = std::stoi(argv[i]);
-#else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
-#endif
-        }
-        else if (arg == "--lora")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapters.push_back({
-                std::string(argv[i]),
-                1.0,
-            });
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-scaled")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            const char * lora_adapter = argv[i];
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapters.push_back({
-                lora_adapter,
-                std::stof(argv[i])
-            });
-            params.use_mmap = false;
-        }
-        else if (arg == "-v" || arg == "--verbose")
-        {
-            server_verbose = true;
-        }
-        else if (arg == "--mlock")
-        {
-            params.use_mlock = true;
-        }
-        else if (arg == "--no-mmap")
-        {
-            params.use_mmap = false;
-        }
-        else if (arg == "--numa")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            } else {
-                std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-                else { invalid_param = true; break; }
-            }
-        }
-        else if (arg == "--embedding")
-        {
-            params.embedding = true;
-        }
-        else if (arg == "-cb" || arg == "--cont-batching")
-        {
-            params.cont_batching = true;
-        }
-        else if (arg == "-fa" || arg == "--flash-attn")
-        {
-            params.flash_attn = true;
-        }
-        else if (arg == "-np" || arg == "--parallel")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_parallel = std::stoi(argv[i]);
-        }
-        else if (arg == "-n" || arg == "--n-predict")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        }
-        else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        }
-        else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
-        }
-        else if(arg == "--mmproj")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.mmproj = argv[i];
-        }
-        else if (arg == "--log-format")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (std::strcmp(argv[i], "json") == 0)
-            {
-                server_log_json = true;
-            }
-            else if (std::strcmp(argv[i], "text") == 0)
-            {
-                server_log_json = false;
-            }
-            else
-            {
-                invalid_param = true;
-                break;
-            }
-        }
-        else if (arg == "--log-disable")
-        {
-            LOG_WARNING("DEPRECATED: --log-disable does nothing anymore", {});
-        }
-        else if (arg == "--slots-endpoint-disable")
-        {
-            sparams.slots_endpoint = false;
-        }
-        else if (arg == "--metrics")
-        {
-            sparams.metrics_endpoint = true;
-        }
-        else if (arg == "--chat-template")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            if (!verify_custom_template(argv[i])) {
-                fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
-                fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
-                invalid_param = true;
-                break;
-            }
-        }
-        else if (arg == "--override-kv")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.val_i64 = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.val_f64 = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.val_bool = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.val_bool = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            params.kv_overrides.push_back(kvo);
-        }
-        else
-        {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(1);
-        }
-    }
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    postprocess_cpu_params(params.cpuparams, nullptr);
-    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
-
-    if (invalid_param)
-    {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        server_print_usage(argv[0], default_params, default_sparams);
-        exit(1);
-    }
-}
-
-/* llama.cpp completion api semantics */
-static json format_partial_response(
-    llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
-) {
-    json res = json
-    {
-        {"content",    content },
-        {"stop",       false},
-        {"slot_id",    slot->id },
-        {"multimodal", llama.multimodal }
-    };
-
-    if (slot->sparams.n_probs > 0)
-    {
-        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
-    }
-
-    return res;
-}
-
-static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-{
-    return json {
-        {"tokens", tokens}
-    };
-}
-
-static json format_detokenized_response(std::string content)
-{
-    return json {
-        {"content", content}
-    };
-}
-
-
-static void log_server_request(const httplib::Request &req, const httplib::Response &res)
-{
-    // skip GH copilot requests when using default port
-    if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
-    {
-        return;
-    }
-
-    LOG_DEBUG("request", {
-        {"remote_addr", req.remote_addr},
-        {"remote_port", req.remote_port},
-        {"status",      res.status},
-        {"method",      req.method},
-        {"path",        req.path},
-        {"params",      req.params},
-    });
-
-    LOG_VERBOSE("request", {
-        {"request",  req.body},
-        {"response", res.body},
-    });
-}
-
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
-{
-    auto & gtps = slot->generated_token_probs;
-    auto translator = token_translator{llama.ctx};
-    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
-    const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
-    if (slot->generated_text.capacity() < slot->generated_text.size() + len)
-    {
-        slot->generated_text.reserve(slot->generated_text.size() + len);
-    }
-    for (const completion_token_output & cto : gtps)
-    {
-        slot->generated_text += translator(cto);
-    }
-}
-
-std::function<void(int)> shutdown_handler;
-std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
-inline void signal_handler(int signal) {
-    if (is_terminating.test_and_set()) {
-        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
-        // this is for better developer experience, we can remove when the server is stable enough
-        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
-        exit(1);
-    }
-    shutdown_handler(signal);
-}
-
-static bool update_load_progress(float progress, void *data)
-{
-    ((llama_server_context*)data)->modelProgress = progress;
-    return true;
-}
-
-#if defined(_WIN32)
-char* wchar_to_char(const wchar_t* wstr) {
-    if (wstr == nullptr) return nullptr;
-
-    // Determine the number of bytes needed for the UTF-8 string
-    int bytes = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, nullptr, 0, nullptr, nullptr);
-    char* str = new char[bytes];
-
-    // Convert the wide-character string to a UTF-8 string
-    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, bytes, nullptr, nullptr);
-    return str;
-}
-
-int wmain(int argc, wchar_t **wargv) {
-    char** argv = new char*[argc];
-    for (int i = 0; i < argc; ++i) {
-        argv[i] = wchar_to_char(wargv[i]);
-    }
-
-    // Adjust error mode to avoid error dialog after we start.
-    SetErrorMode(SEM_FAILCRITICALERRORS);
-#else
-int main(int argc, char **argv) {
-#endif
-
-#if SERVER_VERBOSE != 1
-    gpt_log_set_verbosity_thold(-1);
-#endif
-    // own arguments required by this example
-    gpt_params params;
-    server_params sparams;
-
-    // struct that contains llama context and inference
-    llama_server_context llama;
-
-    server_params_parse(argc, argv, sparams, params);
-
-    if (params.model_alias == "unknown")
-    {
-        params.model_alias = params.model;
-    }
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    LOG_INFO("starting c++ runner", {});
-    LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
-                            {"commit", LLAMA_COMMIT}});
-
-    LOG_INFO("system info", {
-                                {"n_threads", params.cpuparams.n_threads},
-                                {"n_threads_batch", params.cpuparams_batch.n_threads},
-                                {"total_threads", std::thread::hardware_concurrency()},
-                                {"system_info", llama_print_system_info()},
-                            });
-
-    httplib::Server svr;
-
-    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
-
-    svr.set_default_headers({{"Server", "llama.cpp"}});
-
-    // CORS preflight
-    svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        res.set_header("Access-Control-Allow-Credentials", "true");
-        res.set_header("Access-Control-Allow-Methods", "POST");
-        res.set_header("Access-Control-Allow-Headers", "*");
-    });
-
-    svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
-        server_state current_state = state.load();
-        switch(current_state) {
-            case SERVER_STATE_READY: {
-                // request slots data using task queue
-                task_server task;
-                task.id   = llama.queue_tasks.get_new_id();
-                task.type = TASK_TYPE_METRICS;
-                task.target_id = -1;
-
-                llama.queue_results.add_waiting_task_id(task.id);
-                llama.queue_tasks.post(task);
-
-                // get the result
-                task_result result = llama.queue_results.recv(task.id);
-                llama.queue_results.remove_waiting_task_id(task.id);
-
-                int n_idle_slots       = result.result_json["idle"];
-                int n_processing_slots = result.result_json["processing"];
-
-                json health = {
-                        {"status",           "ok"},
-                        {"slots_idle",       n_idle_slots},
-                        {"slots_processing", n_processing_slots}};
-                res.status = 200; // HTTP OK
-                if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                    health["slots"] = result.result_json["slots"];
-                }
-
-                if (n_idle_slots == 0) {
-                    health["status"] = "no slot available";
-                    if (req.has_param("fail_on_no_slot")) {
-                        res.status = 503; // HTTP Service Unavailable
-                    }
-                }
-                res.set_content(health.dump(), "application/json");
-                break;
-            }
-            case SERVER_STATE_LOADING_MODEL:
-                char buf[128];
-                snprintf(&buf[0], 128, R"({"status": "loading model", "progress": %0.2f})", llama.modelProgress);
-                res.set_content(buf, "application/json");
-                res.status = 503; // HTTP Service Unavailable
-                break;
-            case SERVER_STATE_ERROR:
-                res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
-                res.status = 500; // HTTP Internal Server Error
-                break;
-        }
-    });
-
-    if (sparams.slots_endpoint) {
-        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
-            // request slots data using task queue
-            task_server task;
-            task.id = llama.queue_tasks.get_new_id();
-            task.type = TASK_TYPE_METRICS;
-            task.target_id = -1;
-
-            llama.queue_results.add_waiting_task_id(task.id);
-            llama.queue_tasks.post(task);
-
-            // get the result
-            task_result result = llama.queue_results.recv(task.id);
-            llama.queue_results.remove_waiting_task_id(task.id);
-
-            res.set_content(result.result_json["slots"].dump(), "application/json");
-            res.status = 200; // HTTP OK
-        });
-    }
-
-    if (sparams.metrics_endpoint) {
-        svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
-            // request slots data using task queue
-            task_server task;
-            task.id = llama.queue_tasks.get_new_id();
-            task.type = TASK_TYPE_METRICS;
-            task.target_id = -1;
-
-            llama.queue_results.add_waiting_task_id(task.id);
-            llama.queue_tasks.post(task);
-
-            // get the result
-            task_result result = llama.queue_results.recv(task.id);
-            llama.queue_results.remove_waiting_task_id(task.id);
-
-            json data = result.result_json;
-
-            uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
-            uint64_t t_prompt_processing       = data["t_prompt_processing"];
-
-            uint64_t n_tokens_predicted       = data["n_tokens_predicted"];
-            uint64_t t_tokens_generation      = data["t_tokens_generation"];
-
-            int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
-
-            // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
-            json all_metrics_def = json {
-                    {"counter", {{
-                            {"name",  "prompt_tokens_total"},
-                            {"help",  "Number of prompt tokens processed."},
-                            {"value",  data["n_prompt_tokens_processed_total"]}
-                    }, {
-                            {"name",  "tokens_predicted_total"},
-                            {"help",  "Number of generation tokens processed."},
-                            {"value",  data["n_tokens_predicted_total"]}
-                    }}},
-                    {"gauge", {{
-                            {"name",  "prompt_tokens_seconds"},
-                            {"help",  "Average prompt throughput in tokens/s."},
-                            {"value",  n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
-                    },{
-                            {"name",  "predicted_tokens_seconds"},
-                            {"help",  "Average generation throughput in tokens/s."},
-                            {"value",  n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
-                     },{
-                            {"name",  "kv_cache_usage_ratio"},
-                            {"help",  "KV-cache usage. 1 means 100 percent usage."},
-                            {"value",  1. * kv_cache_used_cells / params.n_ctx}
-                     },{
-                            {"name",  "kv_cache_tokens"},
-                            {"help",  "KV-cache tokens."},
-                            {"value",  data["kv_cache_tokens_count"]}
-                    },{
-                            {"name",  "requests_processing"},
-                            {"help",  "Number of request processing."},
-                            {"value",  data["processing"]}
-                  },{
-                            {"name",  "requests_deferred"},
-                            {"help",  "Number of request deferred."},
-                            {"value",  data["deferred"]}
-                  }}}
-            };
-
-            std::stringstream prometheus;
-            for (const auto& el : all_metrics_def.items()) {
-                const auto& type = el.key();
-                const auto& metrics_def = el.value();
-                for (const auto& metric_def : metrics_def) {
-                    std::string name = metric_def["name"];
-                    std::string help = metric_def["help"];
-                    auto value = json_value(metric_def, "value", 0);
-                    prometheus << "# HELP llamacpp:" << name << " " << help  << "\n"
-                               << "# TYPE llamacpp:" << name << " " << type  << "\n"
-                               << "llamacpp:"        << name << " " << value << "\n";
-                }
-            }
-
-            res.set_content(prometheus.str(), "text/plain; version=0.0.4");
-            res.status = 200; // HTTP OK
-        });
-    }
-
-    svr.set_logger(log_server_request);
-
-    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
-            {
-                const char fmt[] = "500 Internal Server Error\n%s";
-                char buf[BUFSIZ];
-                try
-                {
-                    std::rethrow_exception(std::move(ep));
-                }
-                catch (std::exception &e)
-                {
-                    snprintf(buf, sizeof(buf), fmt, e.what());
-                }
-                catch (...)
-                {
-                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
-                }
-                res.set_content(buf, "text/plain; charset=utf-8");
-                res.status = 500;
-            });
-
-    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
-            {
-                if (res.status == 401)
-                {
-                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
-                }
-                if (res.status == 400)
-                {
-                    res.set_content("Invalid request", "text/plain; charset=utf-8");
-                }
-                else if (res.status == 404)
-                {
-                    res.set_content("File Not Found", "text/plain; charset=utf-8");
-                    res.status = 404;
-                }
-            });
-
-    // set timeouts and change hostname and port
-    svr.set_read_timeout (sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
-
-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
-    {
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
-        return 1;
-    }
-
-    // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
-
-    std::unordered_map<std::string, std::string> log_data;
-    log_data["hostname"] = sparams.hostname;
-    log_data["port"] = std::to_string(sparams.port);
-
-    if (sparams.api_keys.size() == 1) {
-        log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
-    } else if (sparams.api_keys.size() > 1) {
-        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
-    }
-
-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
-    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
-    // load the model
-    params.progress_callback = update_load_progress;
-    params.progress_callback_user_data = (void*)&llama;
-
-    if (!llama.load_model(params))
-    {
-        state.store(SERVER_STATE_ERROR);
-        return 1;
-    } else {
-        llama.initialize();
-        state.store(SERVER_STATE_READY);
-        LOG_INFO("model loaded", {});
-    }
-    const auto model_meta = llama.model_meta();
-
-    // Middleware for API key validation
-    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
-        // If API key is not set, skip validation
-        if (sparams.api_keys.empty()) {
-            return true;
-        }
-
-        // Check for API key in the header
-        auto auth_header = req.get_header_value("Authorization");
-        std::string prefix = "Bearer ";
-        if (auth_header.substr(0, prefix.size()) == prefix) {
-            std::string received_api_key = auth_header.substr(prefix.size());
-            if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
-                return true; // API key is valid
-            }
-        }
-
-        // API key is invalid or not provided
-        res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
-        res.status = 401; // Unauthorized
-
-        LOG_WARNING("Unauthorized: Invalid API Key", {});
-
-        return false;
-    };
-
-    // this is only called if no index.html is found in the public --path
-    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content("server running", "text/plain; charset=utf-8");
-                res.status = 200; // Unauthorized
-                return true;
-            });
-
-    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                if (!validate_api_key(req, res)) {
-                    return;
-                }
-                json data = json::parse(req.body);
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, data, false, -1);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.queue_results.recv(task_id);
-                    if (!result.error && result.stop) {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                    }
-                    llama.queue_results.remove_waiting_task_id(task_id);
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
-                    {
-                        while (true)
-                        {
-                            task_result result = llama.queue_results.recv(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                    "data: " +
-                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                if (result.stop) {
-                                    break;
-                                }
-                            } else {
-                                const std::string str =
-                                    "error: " +
-                                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                break;
-                            }
-                        }
-
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                        sink.done();
-                        return true;
-                    };
-
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                    };
-
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
-
-    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                std::vector<llama_token> tokens;
-                if (body.count("content") != 0)
-                {
-                    tokens = llama.tokenize(body["content"], false);
-                }
-                const json data = format_tokenizer_response(tokens);
-                return res.set_content(data.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                std::string content;
-                if (body.count("tokens") != 0)
-                {
-                    const std::vector<llama_token> tokens = body["tokens"];
-                    content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-                }
-
-                const json data = format_detokenized_response(content);
-                return res.set_content(data.dump(), "application/json; charset=utf-8");
-            });
-
-    svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                json prompt;
-                if (body.count("content") != 0)
-                {
-                    prompt = body["content"];
-                }
-                else
-                {
-                    prompt = "";
-                }
-
-                // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, {{"prompt", prompt}}, true, -1);
-
-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
-
-                // send the result
-                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
-            });
-
-    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
-    //     "Bus error: 10" - this is on macOS, it does not crash on Linux
-    //std::thread t2([&]()
-    /*{
-        bool running = true;
-        while (running)
-        {
-            running = llama.update_slots();
-        }
-    }*/
-    //);
-
-    llama.queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_run_slots(std::bind(
-        &llama_server_context::update_slots, &llama));
-    llama.queue_results.on_multitask_update(std::bind(
-        &llama_server_queue::update_multitask,
-        &llama.queue_tasks,
-        std::placeholders::_1,
-        std::placeholders::_2,
-        std::placeholders::_3
-    ));
-
-    shutdown_handler = [&](int) {
-        llama.queue_tasks.terminate();
-    };
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-
-    for (int i = 0; i < argc; ++i) {
-        delete[] argv[i];
-    }
-    delete[] argv;
-#endif
-    llama.queue_tasks.start_loop();
-    svr.stop();
-    t.join();
-
-    llama_backend_free();
-    return 0;
-}
diff --git a/llm/ext_server/utils.hpp b/llm/ext_server/utils.hpp
deleted file mode 100644
index fc5d301e..00000000
--- a/llm/ext_server/utils.hpp
+++ /dev/null
@@ -1,661 +0,0 @@
-// MIT License
-
-// Copyright (c) 2023 Georgi Gerganov
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
-#include <random>
-#include <iostream>
-#include <thread>
-
-#include "json.hpp"
-
-#include "../llava/clip.h"
-
-using json = nlohmann::json;
-
-extern bool server_verbose;
-extern bool server_log_json;
-
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...)                                            \
-    do                                                                   \
-    {                                                                    \
-        if (server_verbose)                                              \
-        {                                                                \
-            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
-        }                                                                \
-    } while (0)
-#endif
-
-#define LOG_ERROR(  MSG, ...) server_log("ERROR",  __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
-
-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
-};
-
-enum task_type {
-    TASK_TYPE_COMPLETION,
-    TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE,
-    TASK_TYPE_METRICS
-};
-
-struct task_server {
-    int id = -1; // to be filled by llama_server_queue
-    int target_id;
-    task_type type;
-    json data;
-    bool infill_mode = false;
-    bool embedding_mode = false;
-    int multitask_id = -1;
-};
-
-struct task_result {
-    int id;
-    int multitask_id = -1;
-    bool stop;
-    bool error;
-    json result_json;
-};
-
-struct task_multi {
-    int id;
-    std::set<int> subtasks_remaining{};
-    std::vector<task_result> results{};
-};
-
-// completion token output with probabilities
-struct completion_token_output {
-    struct token_prob
-    {
-        llama_token tok;
-        float prob;
-    };
-
-    std::vector<token_prob> probs;
-    llama_token tok;
-    std::string text_to_send;
-};
-
-struct token_translator {
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
-    std::stringstream ss_tid;
-    ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
-        {"tid", ss_tid.str()},
-        {"timestamp", time(nullptr)},
-    };
-
-    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
-        return;
-    }
-
-    if (server_log_json) {
-        log.merge_patch(
-                {
-                        {"level",     level},
-                        {"function",  function},
-                        {"line",      line},
-                        {"msg",       message},
-                });
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-
-        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
-    } else {
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-
-        std::stringstream ss;
-        ss << level << " [" << function << "] " << message << " |";
-        for (const auto& el : log.items())
-        {
-            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
-            ss << " " << el.key() << "=" << value;
-        }
-
-        const std::string str = ss.str();
-        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
-    }
-}
-
-//
-// server utils
-//
-
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
-    // Fallback null to default value
-    return body.contains(key) && !body.at(key).is_null()
-        ? body.value(key, default_value)
-        : default_value;
-}
-
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    std::vector<char> buf(1);
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
-    return res >= 0;
-}
-
-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
-
-    for (size_t i = 0; i < messages.size(); ++i) {
-        auto &curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
-    }
-
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    std::string formatted_chat(buf.data(), res);
-    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
-    return formatted_chat;
-}
-
-//
-// work queue utils
-//
-
-struct llama_server_queue {
-    int id = 0;
-    std::mutex mutex_tasks;
-    bool running;
-    // queues
-    std::vector<task_server> queue_tasks;
-    std::vector<task_server> queue_tasks_deferred;
-    std::vector<task_multi> queue_multitasks;
-    std::condition_variable condition_tasks;
-    // callback functions
-    std::function<void(task_server&)> callback_new_task;
-    std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_run_slots;
-
-    // Add a new task to the end of the queue
-    int post(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        if (task.id == -1) {
-            task.id = id++;
-            LOG_VERBOSE("new task id", {{"new_id", task.id}});
-        }
-        queue_tasks.push_back(std::move(task));
-        condition_tasks.notify_one();
-        return task.id;
-    }
-
-    // Add a new task, but defer until one slot is available
-    void defer(task_server task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        queue_tasks_deferred.push_back(std::move(task));
-    }
-
-    // Get the next id for creating anew task
-    int get_new_id() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        int new_id = id++;
-        LOG_VERBOSE("new task id", {{"new_id", new_id}});
-        return new_id;
-    }
-
-    // Register function to process a new task
-    void on_new_task(std::function<void(task_server&)> callback) {
-        callback_new_task = callback;
-    }
-
-    // Register function to process a multitask when it is finished
-    void on_finish_multitask(std::function<void(task_multi&)> callback) {
-        callback_finish_multitask = callback;
-    }
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_run_slots(std::function<void(void)> callback) {
-        callback_run_slots = callback;
-    }
-
-    // Call when the state of one slot is changed
-    void notify_slot_changed() {
-        // move deferred tasks back to main loop
-        std::unique_lock<std::mutex> lock(mutex_tasks);
-        for (auto & task : queue_tasks_deferred) {
-            queue_tasks.push_back(std::move(task));
-        }
-        queue_tasks_deferred.clear();
-    }
-
-    // end the start_loop routine
-    void terminate() {
-        {
-            std::unique_lock<std::mutex> lock(mutex_tasks);
-            running = false;
-        }
-        condition_tasks.notify_all();
-    }
-
-    /**
-     * Main loop consists of these steps:
-     * - Wait until a new task arrives
-     * - Process the task (i.e. maybe copy data into slot)
-     * - Check if multitask is finished
-     * - Run all slots
-     */
-    void start_loop() {
-        running = true;
-        while (true) {
-            LOG_VERBOSE("new task may arrive", {});
-            {
-                while (true)
-                {
-                    std::unique_lock<std::mutex> lock(mutex_tasks);
-                    if (queue_tasks.empty()) {
-                        lock.unlock();
-                        break;
-                    }
-                    task_server task = queue_tasks.front();
-                    queue_tasks.erase(queue_tasks.begin());
-                    lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
-                    callback_new_task(task);
-                }
-                LOG_VERBOSE("update_multitasks", {});
-                // check if we have any finished multitasks
-                auto queue_iterator = queue_multitasks.begin();
-                while (queue_iterator != queue_multitasks.end())
-                {
-                    if (queue_iterator->subtasks_remaining.empty())
-                    {
-                        // all subtasks done == multitask is done
-                        task_multi current_multitask = *queue_iterator;
-                        callback_finish_multitask(current_multitask);
-                        // remove this multitask
-                        queue_iterator = queue_multitasks.erase(queue_iterator);
-                    }
-                    else
-                    {
-                        ++queue_iterator;
-                    }
-                }
-                // all tasks in the current loop is processed, slots data is now ready
-                LOG_VERBOSE("callback_run_slots", {});
-                callback_run_slots();
-            }
-            LOG_VERBOSE("wait for new task", {});
-            // wait for new task
-            {
-                std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (queue_tasks.empty()) {
-                    if (!running) {
-                        LOG_VERBOSE("ending start_loop", {});
-                        return;
-                    }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
-                }
-            }
-        }
-    }
-
-    //
-    // functions to manage multitasks
-    //
-
-    // add a multitask by specifying the id of all subtask (subtask is a task_server)
-    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_multi multi;
-        multi.id = multitask_id;
-        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
-        queue_multitasks.push_back(multi);
-    }
-
-    // updatethe remaining subtasks, while appending results to multitask
-    void update_multitask(int multitask_id, int subtask_id, task_result& result)
-    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        for (auto& multitask : queue_multitasks)
-        {
-            if (multitask.id == multitask_id)
-            {
-                multitask.subtasks_remaining.erase(subtask_id);
-                multitask.results.push_back(result);
-            }
-        }
-    }
-};
-
-struct llama_server_response {
-    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
-    callback_multitask_t callback_update_multitask;
-    // for keeping track of all tasks waiting for the result
-    std::set<int> waiting_task_ids;
-    // the main result queue
-    std::vector<task_result> queue_results;
-    std::mutex mutex_results;
-    std::condition_variable condition_results;
-
-    // add the task_id to the list of tasks waiting for response
-    void add_waiting_task_id(int task_id) {
-        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.insert(task_id);
-    }
-
-    // when the request is finished, we can remove task associated with it
-    void remove_waiting_task_id(int task_id) {
-        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.erase(task_id);
-    }
-
-    // This function blocks the thread until there is a response for this task_id
-    task_result recv(int task_id) {
-        while (true)
-        {
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return !queue_results.empty();
-            });
-
-            for (int i = 0; i < (int) queue_results.size(); i++)
-            {
-                if (queue_results[i].id == task_id)
-                {
-                    assert(queue_results[i].multitask_id == -1);
-                    task_result res = queue_results[i];
-                    queue_results.erase(queue_results.begin() + i);
-                    return res;
-                }
-            }
-        }
-
-        // should never reach here
-    }
-
-    // Register the function to update multitask
-    void on_multitask_update(callback_multitask_t callback) {
-        callback_update_multitask = callback;
-    }
-
-    // Send a new result to a waiting task_id
-    void send(task_result result) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {{"task_id", result.id}});
-        for (auto& task_id : waiting_task_ids) {
-            // LOG_TEE("waiting task id %i \n", task_id);
-            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
-            if (result.multitask_id == task_id)
-            {
-                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
-                callback_update_multitask(task_id, result.id, result);
-                continue;
-            }
-
-            if (result.id == task_id)
-            {
-                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
-                queue_results.push_back(result);
-                condition_results.notify_all();
-                return;
-            }
-        }
-    }
-};
-
-//
-// base64 utils (TODO: move to common in the future)
-//
-
-static const std::string base64_chars =
-             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-             "abcdefghijklmnopqrstuvwxyz"
-             "0123456789+/";
-
-static inline bool is_base64(uint8_t c)
-{
-    return (isalnum(c) || (c == '+') || (c == '/'));
-}
-
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
-{
-    int i = 0;
-    int j = 0;
-    int in_ = 0;
-
-    int in_len = encoded_string.size();
-
-    uint8_t char_array_4[4];
-    uint8_t char_array_3[3];
-
-    std::vector<uint8_t> ret;
-
-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
-    {
-        char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
-        {
-            for (i = 0; i <4; i++)
-            {
-                char_array_4[i] = base64_chars.find(char_array_4[i]);
-            }
-
-            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-
-            for (i = 0; (i < 3); i++)
-            {
-                ret.push_back(char_array_3[i]);
-            }
-            i = 0;
-        }
-    }
-
-    if (i)
-    {
-        for (j = i; j <4; j++)
-        {
-            char_array_4[j] = 0;
-        }
-
-        for (j = 0; j <4; j++)
-        {
-            char_array_4[j] = base64_chars.find(char_array_4[j]);
-        }
-
-        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-
-        for (j = 0; (j < i - 1); j++)
-        {
-            ret.push_back(char_array_3[j]);
-        }
-    }
-
-    return ret;
-}
-
-//
-// random string / id
-//
-
-static std::string random_string()
-{
-    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
-
-    std::random_device rd;
-    std::mt19937 generator(rd());
-
-    std::string result(32, ' ');
-
-    for (int i = 0; i < 32; ++i) {
-        result[i] = str[generator() % str.size()];
-    }
-
-    return result;
-}
-
-static std::string gen_chatcmplid()
-{
-    std::stringstream chatcmplid;
-    chatcmplid << "chatcmpl-" << random_string();
-    return chatcmplid.str();
-}
-
-//
-// other common utils
-//
-
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
-    {
-    }
-    return i;
-}
-
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
-
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (!text.empty() && !stop.empty())
-    {
-        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
-                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
-                    return text.size() - char_index - 1;
-                }
-            }
-        }
-    }
-    return std::string::npos;
-}
-
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
-    std::string ret;
-    for (; begin != end; ++begin)
-    {
-        ret += llama_token_to_piece(ctx, *begin);
-    }
-    return ret;
-}
-
-// format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
-    // if the size is 1 and first bit is 1, meaning it's a partial character
-    //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
-    {
-        std::stringstream ss;
-        ss << std::hex << (out[0] & 0xff);
-        std::string res(ss.str());
-        out = "byte: \\x" + res;
-    }
-    return out;
-}
-
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
-    json out = json::array();
-    for (const auto &prob : probs)
-    {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
-            });
-        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
-        });
-    }
-    return out;
-}
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
deleted file mode 100644
index e58c2d40..00000000
--- a/llm/generate/gen_common.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-# common logic across linux and darwin
-
-init_vars() {
-    case "${GOARCH}" in
-    "amd64")
-        ARCH="x86_64"
-        ;;
-    "arm64")
-        ARCH="arm64"
-        ;;
-    *)
-        echo "GOARCH must be set"
-        echo "this script is meant to be run from within go generate"
-        exit 1
-        ;;
-    esac
-
-    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
-    CMAKE_TARGETS="--target ollama_llama_server"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
-    else
-        # TODO - add additional optimization flags...
-        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
-    fi
-    case $(uname -s) in
-    "Darwin")
-        LIB_EXT="dylib"
-        WHOLE_ARCHIVE="-Wl,-force_load"
-        NO_WHOLE_ARCHIVE=""
-        GCC_ARCH="-arch ${ARCH}"
-        DIST_BASE=../../dist/darwin-${GOARCH}/
-        PAYLOAD_BASE=../../build/darwin/${GOARCH}
-        ;;
-    "Linux")
-        LIB_EXT="so"
-        WHOLE_ARCHIVE="-Wl,--whole-archive"
-        NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive"
-
-        # Cross compiling not supported on linux - Use docker
-        GCC_ARCH=""
-        DIST_BASE=../../dist/linux-${GOARCH}/
-        PAYLOAD_BASE=../../build/linux/${GOARCH}
-        ;;
-    *)
-        ;;
-    esac
-    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
-        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
-    fi
-    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
-    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
-}
-
-git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
-        echo "Skipping submodule initialization"
-        return
-    fi
-    # Make sure the tree is clean after the directory moves
-    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
-        echo "Cleaning up old submodule"
-        rm -rf ${LLAMACPP_DIR}
-    fi
-    git submodule init
-    git submodule update --force ${LLAMACPP_DIR}
-
-}
-
-apply_patches() {
-    # apply temporary patches until fix is upstream
-    for patch in ../patches/*.patch; do
-        git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
-    done
-}
-
-build() {
-    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
-    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-    # remove unnecessary build artifacts
-    rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
-}
-
-dist() {
-    [ -z "${RUNNER}" ] && exit 1
-    mkdir -p ${RUNNER_BASE}/${RUNNER}/
-    for f in ${BUILD_DIR}/bin/* ; do
-        cp ${f} ${RUNNER_BASE}/${RUNNER}/
-    done
-    # check for lib directory
-    if [ -d ${BUILD_DIR}/lib ]; then
-        for f in ${BUILD_DIR}/lib/* ; do
-            cp ${f} ${RUNNER_BASE}/${RUNNER}/
-        done
-    fi
-}
-
-# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
-compress() {
-    [ -z "${RUNNER}" ] && exit 1
-    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
-    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
-    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
-    for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
-        compress_pids+=" $!"
-    done
-    # check for lib directory
-    if [ -d ${BUILD_DIR}/lib ]; then
-        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
-            compress_pids+=" $!"
-        done
-    fi
-    echo
-}
-
-wait_for_compress() {
-    for pid in ${compress_pids}; do
-        wait $pid
-    done
-    echo "Finished compression"
-}
-
-install() {
-    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
-        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
-        cp -af "${lib}" "${BUILD_DIR}/bin/"
-    done
-}
-
-# Keep the local tree clean after we're done with the build
-cleanup() {
-    git submodule update --force ${LLAMACPP_DIR}
-}
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
deleted file mode 100755
index 3c7c71ea..00000000
--- a/llm/generate/gen_darwin.sh
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be ./llm/generate/
-
-# TODO - add hardening to detect missing tools (cmake, etc.)
-
-set -ex
-set -o pipefail
-compress_pids=""
-echo "Starting darwin generate script"
-source $(dirname $0)/gen_common.sh
-init_vars
-git_module_setup
-apply_patches
-
-sign() {
-    if [ -n "$APPLE_IDENTITY" ]; then
-        codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1
-    fi
-}
-
-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
-
-case "${GOARCH}" in
-"amd64")
-    COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DGGML_METAL=off -DGGML_NATIVE=off"
-
-    if [ -z "$OLLAMA_SKIP_CPU_GENERATE" ]; then
-        #
-        # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building LCD CPU"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-
-        #
-        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-        # Approximately 400% faster than LCD on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        RUNNER=cpu_avx
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building AVX CPU"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-
-        #
-        # ~2013 CPU Dynamic library
-        # Approximately 10% faster than AVX on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        RUNNER=cpu_avx2
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        echo "Building AVX2 CPU"
-        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-    fi
-    ;;
-"arm64")
-
-    if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
-        init_vars
-        CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        RUNNER="metal"
-        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
-        EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
-        build
-        sign ${BUILD_DIR}/bin/ollama_llama_server
-        compress
-    fi
-    ;;
-*)
-    echo "GOARCH must be set"
-    echo "this script is meant to be run from within go generate"
-    exit 1
-    ;;
-esac
-
-cleanup
-wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
deleted file mode 100755
index cacaaf4b..00000000
--- a/llm/generate/gen_linux.sh
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/bin/bash
-# This script is intended to run inside the go generate
-# working directory must be llm/generate/
-
-# First we build one or more CPU based LLM libraries
-#
-# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
-# library dependencies
-#
-# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCM
-# libraries are quite large, and also dynamically load data files at runtime
-# which in turn are large, so we don't attempt to cary them as payload
-
-set -ex
-set -o pipefail
-compress_pids=""
-
-# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
-amdGPUs() {
-    if [ -n "${AMDGPU_TARGETS}" ]; then
-        echo "${AMDGPU_TARGETS}"
-        return
-    fi
-    GPU_LIST=(
-        "gfx900"
-        "gfx906:xnack-"
-        "gfx908:xnack-"
-        "gfx90a:xnack+"
-        "gfx90a:xnack-"
-        "gfx940"
-        "gfx941"
-        "gfx942"
-        "gfx1010"
-        "gfx1012"
-        "gfx1030"
-        "gfx1100"
-        "gfx1101"
-        "gfx1102"
-    )
-    (
-        IFS=$';'
-        echo "'${GPU_LIST[*]}'"
-    )
-}
-
-echo "Starting linux generate script"
-if [ -z "${CUDACXX}" ]; then
-    if [ -x /usr/local/cuda/bin/nvcc ]; then
-        export CUDACXX=/usr/local/cuda/bin/nvcc
-    else
-        # Try the default location in case it exists
-        export CUDACXX=$(command -v nvcc)
-    fi
-fi
-COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
-source $(dirname $0)/gen_common.sh
-init_vars
-git_module_setup
-apply_patches
-
-init_vars
-if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
-    # Users building from source can tune the exact flags we pass to cmake for configuring
-    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
-    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
-        init_vars
-        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        RUNNER="cpu"
-        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-        echo "Building custom CPU"
-        build
-        install
-        dist
-        compress
-    else
-        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
-        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
-        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
-        # Note: the following seem to yield slower results than AVX2 - ymmv
-        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
-        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
-        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
-
-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
-            #
-            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
-            #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            RUNNER=cpu
-            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-            echo "Building LCD CPU"
-            build
-            install
-            dist
-            compress
-        fi
-
-        if [ "${ARCH}" == "x86_64" ]; then
-            #
-            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
-            #
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
-                #
-                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-                # Approximately 400% faster than LCD on same CPU
-                #
-                init_vars
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-                RUNNER=cpu_avx
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-                echo "Building AVX CPU"
-                build
-                install
-                dist
-                compress
-            fi
-
-            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-                #
-                # ~2013 CPU Dynamic library
-                # Approximately 10% faster than AVX on same CPU
-                #
-                init_vars
-                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-                RUNNER=cpu_avx2
-                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-                echo "Building AVX2 CPU"
-                build
-                install
-                dist
-                compress
-            fi
-        fi
-    fi
-else
-    echo "Skipping CPU generation step as requested"
-fi
-
-# If needed, look for the default CUDA toolkit location
-if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
-    CUDA_LIB_DIR=/usr/local/cuda/lib64
-fi
-
-# If needed, look for CUDA on Arch Linux
-if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
-    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
-fi
-
-# Allow override in case libcudart is in the wrong place
-if [ -z "${CUDART_LIB_DIR}" ]; then
-    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
-fi
-
-if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
-    echo "CUDA libraries detected - building dynamic CUDA library"
-    init_vars
-    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
-        CUDA_VARIANT=_v${CUDA_MAJOR}
-    fi
-    if [ "${ARCH}" == "arm64" ]; then
-        echo "ARM CPU detected - disabling unsupported AVX instructions"
-
-        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
-        #
-        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
-        # Disabling has minimal performance effect while maintaining compatibility.
-        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
-    fi
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
-        echo "Building custom CUDA GPU"
-    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
-    fi
-    export CUDAFLAGS="-t8"
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    RUNNER=cuda${CUDA_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
-    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
-    build
-    install
-    dist
-    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
-    mkdir -p "${CUDA_DIST_DIR}"
-    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
-        cp -a "${lib}" "${CUDA_DIST_DIR}"
-    done
-    compress
-
-fi
-
-if [ -z "${ONEAPI_ROOT}" ]; then
-    # Try the default location in case it exists
-    ONEAPI_ROOT=/opt/intel/oneapi
-fi
-
-if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
-    echo "OneAPI libraries detected - building dynamic OneAPI library"
-    init_vars
-    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
-    CC=icx
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    RUNNER=oneapi
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
-    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
-    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
-    build
-
-    # copy oneAPI dependencies
-    mkdir -p "${ONEAPI_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
-    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
-    install
-    dist
-    compress
-fi
-
-if [ -z "${ROCM_PATH}" ]; then
-    # Try the default location in case it exists
-    ROCM_PATH=/opt/rocm
-fi
-
-if [ -z "${CLBlast_DIR}" ]; then
-    # Try the default location in case it exists
-    if [ -d /usr/lib/cmake/CLBlast ]; then
-        export CLBlast_DIR=/usr/lib/cmake/CLBlast
-    fi
-fi
-
-if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
-    echo "ROCm libraries detected - building dynamic ROCm library"
-    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
-        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
-    fi
-    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
-    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
-        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
-        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
-        echo "Building custom ROCM GPU"
-    fi
-    RUNNER=rocm${ROCM_VARIANT}
-    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
-    # ROCm dependencies are too large to fit into a unified bundle
-    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
-    # TODO figure out how to disable runpath (rpath)
-    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
-    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
-    build
-
-    # copy the ROCM dependencies
-    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
-        cp -a "${dep}"* "${ROCM_DIST_DIR}"
-        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
-            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
-        fi
-    done
-    install
-    dist
-    compress
-fi
-
-cleanup
-wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
deleted file mode 100644
index 89d7bc25..00000000
--- a/llm/generate/gen_windows.ps1
+++ /dev/null
@@ -1,403 +0,0 @@
-#!powershell
-
-$ErrorActionPreference = "Stop"
-
-function amdGPUs {
-    if ($env:AMDGPU_TARGETS) {
-        return $env:AMDGPU_TARGETS
-    }
-    # Current supported rocblas list from ROCm v6.1.2 on windows
-    # https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus
-    $GPU_LIST = @(
-        "gfx1030"
-        "gfx1100"
-        "gfx1101"
-        "gfx1102"
-    )
-    $GPU_LIST -join ';'
-}
-
-
-function init_vars {
-    write-host "Checking for cmake..."
-    get-command cmake
-    write-host "Checking for ninja..."
-    $d=(get-command -ea 'silentlycontinue' ninja).path
-    if ($null -eq $d) {
-        $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
-        $matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe)
-        if ($matches.count -eq 0) {
-            throw "Unable to locate ninja"
-        }
-        $ninjaDir=($matches[0].FullName | split-path -parent)
-        $env:PATH="$env:PATH;$ninjaDir"
-    }
-    if (!$script:SRC_DIR) {
-        $script:SRC_DIR = $(resolve-path "..\..\")
-    }
-    if (!$script:llamacppDir) {
-        $script:llamacppDir = "../llama.cpp"
-    }
-    if (!$script:cmakeTargets) {
-        $script:cmakeTargets = @("ollama_llama_server")
-    }
-    $script:cmakeDefs = @(
-        "-DBUILD_SHARED_LIBS=on",
-        "-DGGML_NATIVE=off",
-        "-DGGML_OPENMP=off"
-        )
-    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
-    md "$script:DIST_BASE" -ea 0 > $null
-    if ($env:CGO_CFLAGS -contains "-g") {
-        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
-        $script:config = "RelWithDebInfo"
-    } else {
-        $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off", "-DCMAKE_BUILD_TYPE=Release")
-        $script:config = "Release"
-    }
-    if ($null -ne $env:CMAKE_SYSTEM_VERSION) {
-        $script:cmakeDefs += @("-DCMAKE_SYSTEM_VERSION=${env:CMAKE_SYSTEM_VERSION}")
-    }
-    # Try to find the CUDA dir
-    if ($env:CUDA_LIB_DIR -eq $null) {
-        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($d -ne $null) {
-            $script:CUDA_LIB_DIR=($d| split-path -parent)
-            $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include"
-        }
-    } else {
-        $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
-    }
-    $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
-    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
-        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
-    } else {
-        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
-    }
-    # Note: Windows Kits 10 signtool crashes with GCP's plugin
-    if ($null -eq $env:SIGN_TOOL) {
-        ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe"
-    } else {
-        ${script:SignTool}=${env:SIGN_TOOL}
-    }
-    if ("${env:KEY_CONTAINER}") {
-        ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt")
-    }
-}
-
-function git_module_setup {
-    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
-    & git submodule init
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & git submodule update --force "${script:llamacppDir}"
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-}
-
-function apply_patches {
-    # Apply temporary patches until fix is upstream
-    foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
-        git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
-    }
-}
-
-function build {
-    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
-    & cmake --version
-    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    if ($cmakeDefs -contains "-G") {
-        $extra=@("-j8")
-    } else {
-        $extra= @("--", "/maxCpuCount:8")
-    }
-    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
-    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
-    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # Rearrange output to be consistent between different generators
-    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
-        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
-        remove-item "${script:buildDir}/bin/${script:config}"
-    }
-}
-
-function sign {
-    if ("${env:KEY_CONTAINER}") {
-        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
-        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
-            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
-                /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
-    }
-}
-
-function install {
-    write-host "Installing binaries to dist dir ${script:distDir}"
-    mkdir ${script:distDir} -ErrorAction SilentlyContinue
-    $binaries = dir "${script:buildDir}/bin/*.exe"
-    foreach ($file in $binaries) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
-    }
-
-    write-host "Installing dlls to dist dir ${script:distDir}"
-    $dlls = dir "${script:buildDir}/bin/*.dll"
-    foreach ($file in $dlls) {
-        copy-item -Path $file -Destination ${script:distDir} -Force
-    }
-}
-
-function cleanup {
-    $patches = Get-ChildItem "../patches/*.diff"
-    foreach ($patch in $patches) {
-        # Extract file paths from the patch file
-        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
-            $parts = $_ -split ' '
-            ($parts[1] -split '/', 2)[1]
-        }
-
-        # Checkout each file
-        foreach ($file in $filePaths) {
-            git -C "${script:llamacppDir}" checkout $file
-        }
-        git -C "${script:llamacppDir}" checkout CMakeLists.txt
-    }
-}
-
-
-# -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-# -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
-# -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
-
-
-function build_cpu_x64 {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu"
-        $script:distDir="$script:DIST_BASE\cpu"
-        write-host "Building LCD CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}
-
-function build_cpu_arm64 {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
-        init_vars
-        write-host "Checking for clang..."
-        get-command clang
-        $env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only"
-        $env:CXXFLAGS="$env:CFLAGS"
-        $env:LDFLAGS="-static-libstdc++"
-        $script:cmakeDefs = $script:commonCpuDefs + @(
-            "-DCMAKE_VERBOSE_MAKEFILE=on",
-            "-DCMAKE_C_COMPILER=clang.exe",
-            "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DMSVC_RUNTIME_LIBRARY=MultiThreaded"
-        ) + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu"
-        $script:distDir="$script:DIST_BASE\cpu"
-        write-host "Building LCD CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}
-
-
-function build_cpu_avx() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
-        $script:distDir="$script:DIST_BASE\cpu_avx"
-        write-host "Building AVX CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX generation step as requested"
-    }
-}
-
-function build_cpu_avx2() {
-    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
-        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=on", "-DGGML_AVX2=on", "-DGGML_AVX512=off", "-DGGML_FMA=on", "-DGGML_F16C=on") + $script:cmakeDefs
-        $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
-        $script:distDir="$script:DIST_BASE\cpu_avx2"
-        write-host "Building AVX2 CPU"
-        build
-        sign
-        install
-    } else {
-        write-host "Skipping CPU AVX2 generation step as requested"
-    }
-}
-
-function build_cuda() {
-    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
-        # Then build cuda as a dynamically loaded library
-        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
-        if ($null -ne $script:CUDA_VERSION) {
-            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
-        }
-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-        $script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
-        $script:cmakeDefs += @(
-            "-A", "x64",
-            "-DGGML_CUDA=ON",
-            "-DGGML_AVX=on",
-            "-DGGML_AVX2=off",
-            "-DCMAKE_CUDA_FLAGS=-t6",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
-            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
-            )
-        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
-            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
-            $script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
-            write-host "building custom CUDA GPU"
-        }
-        build
-        sign
-        install
-
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    } else {
-        write-host "Skipping CUDA generation step"
-    }
-}
-
-function build_oneapi() {
-  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}"))  {
-    # Get oneAPI version
-    $script:ONEAPI_VERSION = icpx --version
-    $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
-    if ($null -ne $script:ONEAPI_VERSION) {
-      $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
-    }
-    init_vars
-    $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
-    $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
-    $script:cmakeDefs += @(
-      "-G", "MinGW Makefiles",
-      "-DGGML_SYCL=ON",
-      "-DCMAKE_C_COMPILER=icx",
-      "-DCMAKE_CXX_COMPILER=icx",
-      "-DCMAKE_BUILD_TYPE=Release"
-    )
-
-    Write-Host "Building oneAPI"
-    build
-    # Ninja doesn't prefix with config name
-    if ($null -ne $script:DUMPBIN) {
-      & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
-    }
-    sign
-    install
-
-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-  } else {
-    Write-Host "Skipping oneAPI generation step"
-  }
-}
-
-function build_rocm() {
-    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
-        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
-        if ($null -ne $script:ROCM_VERSION) {
-            $script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
-        }
-
-        init_vars
-        $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
-        $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
-        $script:cmakeDefs += @(
-            "-G", "Ninja",
-            "-DCMAKE_C_COMPILER=clang.exe",
-            "-DCMAKE_CXX_COMPILER=clang++.exe",
-            "-DGGML_HIPBLAS=on",
-            "-DHIP_PLATFORM=amd",
-            "-DGGML_AVX=on",
-            "-DGGML_AVX2=off",
-            "-DCMAKE_POSITION_INDEPENDENT_CODE=on",
-            "-DAMDGPU_TARGETS=$(amdGPUs)",
-            "-DGPU_TARGETS=$(amdGPUs)"
-            )
-
-        # Make sure the ROCm binary dir is first in the path
-        $env:PATH="$env:HIP_PATH\bin;$env:PATH"
-
-        # We have to clobber the LIB var from the developer shell for clang to work properly
-        $env:LIB=""
-        if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
-            write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
-            $script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
-            write-host "building custom ROCM GPU"
-        }
-        write-host "Building ROCm"
-        build
-        # Ninja doesn't prefix with config name
-        ${script:config}=""
-        if ($null -ne $script:DUMPBIN) {
-            & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
-        }
-        sign
-        install
-
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
-    } else {
-        write-host "Skipping ROCm generation step"
-    }
-}
-
-init_vars
-if ($($args.count) -eq 0) {
-    git_module_setup
-    apply_patches
-    if ($script:ARCH -eq "arm64") {
-        build_cpu_arm64
-    } else { # amd64
-        build_cpu_x64
-        build_cpu_avx
-        build_cpu_avx2
-        build_cuda
-        build_oneapi
-        build_rocm
-    }
-
-    cleanup
-    write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
-} else {
-    for ( $i = 0; $i -lt $args.count; $i++ ) {
-        write-host "performing $($args[$i])"
-        & $($args[$i])
-    }
-}
\ No newline at end of file
diff --git a/llm/generate/generate_darwin.go b/llm/generate/generate_darwin.go
deleted file mode 100644
index 77685234..00000000
--- a/llm/generate/generate_darwin.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_darwin.sh
diff --git a/llm/generate/generate_linux.go b/llm/generate/generate_linux.go
deleted file mode 100644
index 2b7e116d..00000000
--- a/llm/generate/generate_linux.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate bash ./gen_linux.sh
diff --git a/llm/generate/generate_windows.go b/llm/generate/generate_windows.go
deleted file mode 100644
index d2ee5428..00000000
--- a/llm/generate/generate_windows.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package generate
-
-//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
diff --git a/llm/llama.cpp b/llm/llama.cpp
deleted file mode 160000
index 3f1ae2e3..00000000
--- a/llm/llama.cpp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555
diff --git a/llm/patches/0000-cmakelist.patch b/llm/patches/0000-cmakelist.patch
deleted file mode 100644
index 02850038..00000000
--- a/llm/patches/0000-cmakelist.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-From 7a3555098d4591c9b329c677654497ed8cee07ec Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Fri, 23 Aug 2024 11:27:48 -0700
-Subject: [PATCH] patch cmakelist
-
----
- CMakeLists.txt | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 415743c2..aaadd13e 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -210,3 +210,5 @@ if (LLAMA_BUILD_EXAMPLES)
-     add_subdirectory(examples)
-     add_subdirectory(pocs)
- endif()
-+
-+add_subdirectory(../ext_server ext_server) # ollama
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0001-load-progress.patch b/llm/patches/0001-load-progress.patch
deleted file mode 100644
index 5d190068..00000000
--- a/llm/patches/0001-load-progress.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From c97ed60c3369294d5551ba099a88ddc509687df1 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 16:55:15 -0600
-Subject: [PATCH] patch load progress
-
----
- common/common.cpp | 2 ++
- common/common.h   | 7 +++++++
- 2 files changed, 9 insertions(+)
-
-diff --git a/common/common.cpp b/common/common.cpp
-index 8d0ed4f9..a09e8a53 100644
---- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -955,6 +955,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
-     mparams.use_mmap        = params.use_mmap;
-     mparams.use_mlock       = params.use_mlock;
-     mparams.check_tensors   = params.check_tensors;
-+    mparams.progress_callback = params.progress_callback;
-+    mparams.progress_callback_user_data = params.progress_callback_user_data;
-     if (params.kv_overrides.empty()) {
-         mparams.kv_overrides = NULL;
-     } else {
-diff --git a/common/common.h b/common/common.h
-index cb87c447..818a4a4a 100644
---- a/common/common.h
-+++ b/common/common.h
-@@ -266,6 +266,13 @@ struct gpt_params {
-     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
-     std::vector<std::string> image; // path to image file(s)
- 
-+    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-+    // If the provided progress_callback returns true, model loading continues.
-+    // If it returns false, model loading is immediately aborted.
-+    llama_progress_callback progress_callback = NULL;
-+    // context pointer passed to the progress callback
-+    void * progress_callback_user_data;
-+
-     // embedding
-     bool embedding         = false; // get only sentence embedding
-     int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0002-clip-log.patch b/llm/patches/0002-clip-log.patch
deleted file mode 100644
index ebcf96e7..00000000
--- a/llm/patches/0002-clip-log.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-From 6fdf4268e13e56f0050fa6a29b029cbd54be49d2 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 16:58:03 -0600
-Subject: [PATCH] clip log
-
----
- examples/llava/clip.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 8aa7b075..b8941c74 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -3,6 +3,7 @@
- // I'll gradually clean and extend it
- // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
- #include "clip.h"
-+#include "common.h"
- #include "ggml.h"
- #include "ggml-alloc.h"
- #include "ggml-backend.h"
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0003-load_exception.patch b/llm/patches/0003-load_exception.patch
deleted file mode 100644
index bb921ddc..00000000
--- a/llm/patches/0003-load_exception.patch
+++ /dev/null
@@ -1,57 +0,0 @@
-From 4f2b9cd0f012c49f40d0784454864ad41ca418b2 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:00:28 -0600
-Subject: [PATCH] load exception
-
----
- src/llama.cpp | 25 ++++++++++++++++---------
- 1 file changed, 16 insertions(+), 9 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index af8afd84..4d1db3d5 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8871,7 +8871,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
-         }
-     } catch (const std::exception & err) {
-         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
--        return -1;
-+        throw;
-     }
- 
-     // loading time will be recalculate after the first eval, so
-@@ -18675,16 +18675,23 @@ struct llama_model * llama_load_model_from_file(
-         }
-         model->rpc_servers.push_back(servers);
-     }
--    int status = llama_model_load(path_model, *model, params);
--    GGML_ASSERT(status <= 0);
--    if (status < 0) {
--        if (status == -1) {
--            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
--        } else if (status == -2) {
--            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-+
-+    try {
-+        int status = llama_model_load(path_model, *model, params);
-+        GGML_ASSERT(status <= 0);
-+        if (status < 0) {
-+            if (status == -1) {
-+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-+            } else if (status == -2) {
-+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-+            }
-+            delete model;
-+            return nullptr;
-         }
-+    } catch (...) {
-+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
-         delete model;
--        return nullptr;
-+        throw;
-     }
- 
-     return model;
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0004-metal.patch b/llm/patches/0004-metal.patch
deleted file mode 100644
index 36421b48..00000000
--- a/llm/patches/0004-metal.patch
+++ /dev/null
@@ -1,57 +0,0 @@
-From 91d3f886f1645b38d9658c0e125603e8d5338146 Mon Sep 17 00:00:00 2001
-From: nobody <>
-Date: Tue, 1 Oct 2024 13:55:01 -0600
-Subject: [PATCH] metal
-
----
- ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
- 1 file changed, 13 insertions(+), 17 deletions(-)
-
-diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
-index 9da08fe2..3a433703 100644
---- a/ggml/src/ggml-metal.m
-+++ b/ggml/src/ggml-metal.m
-@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
-                 // to the matrix-vector kernel
-                 int ne11_mm_min = 1;
- 
--#if 0
-                 // the numbers below are measured on M2 Ultra for 7B and 13B models
-                 // these numbers do not translate to other devices or model sizes
-                 // TODO: need to find a better approach
--                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
--                            switch (src0t) {
--                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
--                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
--                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
--                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
--                                case GGML_TYPE_Q4_0:
--                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
--                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
--                                case GGML_TYPE_Q5_0:                          // not tested yet
--                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
--                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
--                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
--                                default:             ne11_mm_min = 1;  break;
--                            }
-+                        switch (src0t) {
-+                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-+                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q4_0:
-+                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-+                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-+                            case GGML_TYPE_Q5_0:                          // not tested yet
-+                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-+                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-+                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-+                            default:             ne11_mm_min = 1;  break;
-                         }
--#endif
- 
-                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0005-default-pretokenizer.patch b/llm/patches/0005-default-pretokenizer.patch
deleted file mode 100644
index f8f87011..00000000
--- a/llm/patches/0005-default-pretokenizer.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From 0e531d69786c4a96a3a2bcf7b2d576bd6f7edf25 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:13 -0700
-Subject: [PATCH] 05-default-pretokenizer.diff
-
----
- src/llama.cpp | 14 +++-----------
- 1 file changed, 3 insertions(+), 11 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..800dfb95 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
-         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
-             vocab.tokenizer_add_space_prefix = false;
-             vocab.tokenizer_clean_spaces = true;
--            if (tokenizer_pre.empty()) {
--                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
--                LLAMA_LOG_WARN("%s:                                             \n", __func__);
--                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
--                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
--                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
--                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
--                LLAMA_LOG_WARN("%s:                                             \n", __func__);
--                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--            } else if (tokenizer_pre == "default") {
-+            if (tokenizer_pre == "default") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-             } else if (
-                     tokenizer_pre == "llama3"   ||
-@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
-                 vocab.tokenizer_add_bos = true;
-                 vocab.tokenizer_clean_spaces = false;
-             } else {
--                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
-+                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-             }
-         } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0006-embeddings.patch b/llm/patches/0006-embeddings.patch
deleted file mode 100644
index 1f2aeb08..00000000
--- a/llm/patches/0006-embeddings.patch
+++ /dev/null
@@ -1,54 +0,0 @@
-From 235b6d876a74cb09abe26985fa89ebe5bfc9f562 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:06:17 -0600
-Subject: [PATCH] embeddings
-
----
- src/llama.cpp | 15 +++++++++------
- 1 file changed, 9 insertions(+), 6 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 1a8e0c51..e55ec3f8 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -16516,7 +16516,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
-     const auto n_embd  = hparams.n_embd;
- 
-     // TODO: use a per-batch flag for logits presence instead
--    const bool has_logits = !cparams.embeddings;
-+    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
- 
-     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -16794,20 +16794,23 @@ static int llama_decode_internal(
-             // no output
-             res  = nullptr;
-             embd = nullptr;
--        } else if (cparams.embeddings) {
--            res  = nullptr; // do not extract logits for embedding case
--            embd = nullptr;
-+        }
-+
-+        if (cparams.embeddings) {
-             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-+                embd = ggml_graph_node(gf, i);
-                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
--                    embd = ggml_graph_node(gf, i);
-                     break;
-                 }
-             }
--            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
-             embd = nullptr; // do not extract embeddings when not needed
-             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
-         }
-+
-+        if (!cparams.causal_attn) {
-+            res = nullptr; // do not extract logits when not needed
-+        }
-         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- 
-         ggml_backend_sched_alloc_graph(lctx.sched, gf);
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0007-clip-unicode.patch b/llm/patches/0007-clip-unicode.patch
deleted file mode 100644
index 792c8b5f..00000000
--- a/llm/patches/0007-clip-unicode.patch
+++ /dev/null
@@ -1,54 +0,0 @@
-From 01c42149cbdc194644a2f138598029938e0dd447 Mon Sep 17 00:00:00 2001
-From: Gabe Goodhart <ghart@us.ibm.com>
-Date: Thu, 19 Sep 2024 17:09:57 -0600
-Subject: [PATCH] clip unicode
-
----
- examples/llava/clip.cpp | 23 +++++++++++++++++++++++
- 1 file changed, 23 insertions(+)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index b8941c74..3a735f17 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -40,6 +40,14 @@
- #include <cinttypes>
- #include <limits>
- 
-+#if defined(_WIN32)
-+#define WIN32_LEAN_AND_MEAN
-+#ifndef NOMINMAX
-+    #define NOMINMAX
-+#endif
-+#include <windows.h>
-+#endif
-+
- #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
- #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-@@ -1227,7 +1235,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-             return nullptr;
-         }
- 
-+#ifdef _WIN32
-+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
-+        if (!wlen) {
-+            return NULL;
-+        }
-+        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
-+        if (!wlen) {
-+            free(wbuf);
-+            return NULL;
-+        }
-+        auto fin = std::ifstream(wbuf, std::ios::binary);
-+        free(wbuf);
-+#else
-         auto fin = std::ifstream(fname, std::ios::binary);
-+#endif
-         if (!fin) {
-             LOG_ERR("cannot open model file for loading tensors\n");
-             clip_free(new_clip);
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/llm/patches/0008-solar-pro.patch b/llm/patches/0008-solar-pro.patch
deleted file mode 100644
index b46ed9b8..00000000
--- a/llm/patches/0008-solar-pro.patch
+++ /dev/null
@@ -1,412 +0,0 @@
-From a8fe40fa7b026d2db9bb6aeecd24fcd2027110ec Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:16 -0700
-Subject: [PATCH] add solar-pro support
-
-solar-pro introduces block skip connections where blocks are connected
-to other, non-sequential blocks with a scale multiple
-
-this change adds 4 new keys to store the skip connections and one new
-tensor to store the scalar. the scalar is implemented a 1-dimensional
-tensor with 2 elements dervied from the model's bskcn_tv configuration.
-in general, the values are (bskcn_tv, 1 - bskcn_tv)
----
- src/llama.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 255 insertions(+), 15 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..c6fc0c3f 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -217,6 +217,7 @@ enum llm_arch {
-     LLM_ARCH_GRANITE,
-     LLM_ARCH_GRANITE_MOE,
-     LLM_ARCH_CHAMELEON,
-+    LLM_ARCH_SOLAR,
-     LLM_ARCH_UNKNOWN,
- };
- 
-@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_GRANITE,         "granite"      },
-     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
-     { LLM_ARCH_CHAMELEON,       "chameleon"    },
-+    { LLM_ARCH_SOLAR,           "solar"        },
-     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
- };
- 
-@@ -327,6 +329,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
-     LLM_KV_ATTENTION_SLIDING_WINDOW,
-     LLM_KV_ATTENTION_SCALE,
-+    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
- 
-     LLM_KV_ROPE_DIMENSION_COUNT,
-     LLM_KV_ROPE_FREQ_BASE,
-@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
-     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
- 
--    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
--    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
--    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
--    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
--    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
--    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
--    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
--    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
--    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
--    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
--    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
--    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
--    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
--    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-+    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"               },
-+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"            },
-+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"           },
-+    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"                },
-+    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"               },
-+    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"             },
-+    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"       },
-+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon"   },
-+    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                   },
-+    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"              },
-+    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"             },
-+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count"   },
-+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"           },
-+    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                    },
-+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection.%d" },
- 
-     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-     { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-@@ -608,6 +612,7 @@ enum llm_tensor {
-     LLM_TENSOR_ENC_OUTPUT_NORM,
-     LLM_TENSOR_CLS,
-     LLM_TENSOR_CLS_OUT,
-+    LLM_TENSOR_BSKCN_TV,
- };
- 
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1527,6 +1532,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
-             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
-         },
-     },
-+
-+    {
-+        LLM_ARCH_SOLAR,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-+            { LLM_TENSOR_OUTPUT,          "output" },
-+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-+            { LLM_TENSOR_BSKCN_TV,        "bskcn_tv" },
-+        },
-+    },
-     {
-         LLM_ARCH_UNKNOWN,
-         {
-@@ -2360,6 +2384,7 @@ enum e_model {
-     MODEL_15B,
-     MODEL_16B,
-     MODEL_20B,
-+    MODEL_22B,
-     MODEL_30B,
-     MODEL_34B,
-     MODEL_35B,
-@@ -2409,6 +2434,8 @@ struct llama_hparams {
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
-     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
- 
-+    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
-+
-     uint32_t n_layer_dense_lead = 0;
-     uint32_t n_lora_q = 0;
-     uint32_t n_lora_kv = 0;
-@@ -2479,6 +2506,7 @@ struct llama_hparams {
-         if (this->n_head_arr    != other.n_head_arr)    return true;
-         if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-         if (this->n_ff_arr      != other.n_ff_arr)      return true;
-+        if (this->n_bskcn_arr   != other.n_bskcn_arr)   return true;
- 
-         if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
-         if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2588,6 +2616,14 @@ struct llama_hparams {
-             return ssm_d_state * ssm_d_inner;
-         }
-     }
-+
-+    bool n_bskcn(uint32_t n, uint32_t il = 0) const {
-+        if (il < n_layer) {
-+            return n_bskcn_arr[n][il] > 0;
-+        }
-+
-+        GGML_ABORT("fatal error");
-+    }
- };
- 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2769,6 +2805,8 @@ struct llama_layer {
-     struct ggml_tensor * ffn_gate_scale;
-     struct ggml_tensor * ffn_up_scale;
-     struct ggml_tensor * ffn_down_scale;
-+
-+    struct ggml_tensor * bskcn_tv;
- };
- 
- // very similar to llama_batch,
-@@ -6134,6 +6172,21 @@ static void llm_load_hparams(
-                     default: model.type = e_model::MODEL_UNKNOWN;
-                }
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-+
-+                for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
-+                    auto & bskcn = hparams.n_bskcn_arr.at(i);
-+                    bskcn.fill(0);
-+                    ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
-+                }
-+
-+                switch (hparams.n_layer) {
-+                    case 64: model.type = e_model::MODEL_22B; break;
-+                    default: model.type = e_model::MODEL_UNKNOWN;
-+                }
-+            }
-         default: (void)0;
-     }
- 
-@@ -8839,6 +8892,37 @@ static bool llm_load_tensors(
- 
-                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- 
-+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-+                    }
-+                } break;
-+            case LLM_ARCH_SOLAR:
-+                {
-+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-+
-+                    // output
-+                    {
-+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-+                    }
-+
-+                    for (int i = 0; i < n_layer; ++i) {
-+                        ggml_context * ctx_layer = ctx_for_layer(i);
-+                        ggml_context * ctx_split = ctx_for_layer_split(i);
-+
-+                        auto & layer = model.layers[i];
-+
-+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
-+
-+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-+
-+                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
-@@ -16009,7 +16093,6 @@ struct llm_build_context {
- 
-         return gf;
-     }
--
-     // ref: https://github.com/facebookresearch/chameleon
-     // based on the original build_llama() function, changes:
-     //   * qk-norm
-@@ -16187,6 +16270,158 @@ struct llm_build_context {
- 
-         return gf;
-     }
-+
-+    ggml_cgraph * build_solar() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-+
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
-+        const int64_t n_embd_head = hparams.n_embd_head_v;
-+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-+        GGML_ASSERT(n_embd_head == hparams.n_rot);
-+
-+        struct ggml_tensor * cur;
-+        struct ggml_tensor * inpL;
-+
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-+
-+        // inp_pos - contains the positions
-+        struct ggml_tensor * inp_pos = build_inp_pos();
-+
-+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-+
-+        struct ggml_tensor * bskcn_1;
-+        struct ggml_tensor * bskcn_2;
-+
-+        for (int il = 0; il < n_layer; ++il) {
-+            struct ggml_tensor * inpSA = inpL;
-+
-+            if (hparams.n_bskcn(0, il)) {
-+                bskcn_1 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(1, il)) {
-+                bskcn_2 = inpSA;
-+            }
-+
-+            if (hparams.n_bskcn(2, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            if (hparams.n_bskcn(3, il)) {
-+                inpSA = ggml_add(
-+                   ctx0,
-+                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
-+                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
-+            }
-+
-+            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
-+                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "attn_norm", il);
-+
-+            // self-attention
-+            {
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-+                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-+                cb(Qcur, "Qcur", il);
-+                if (model.layers[il].bq) {
-+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-+                    cb(Qcur, "Qcur", il);
-+                }
-+
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-+                cb(Kcur, "Kcur", il);
-+                if (model.layers[il].bk) {
-+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-+                    cb(Kcur, "Kcur", il);
-+                }
-+
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-+                cb(Vcur, "Vcur", il);
-+                if (model.layers[il].bv) {
-+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-+                    cb(Vcur, "Vcur", il);
-+                }
-+
-+                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
-+
-+                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Kcur, "Kcur", il);
-+
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-+                        model.layers[il].wo, model.layers[il].bo,
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-+            }
-+
-+            if (il == n_layer - 1) {
-+                // skip computing output for unused tokens
-+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                n_tokens = n_outputs;
-+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-+            }
-+
-+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-+            cb(ffn_inp, "ffn_inp", il);
-+
-+            // feed-forward network
-+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-+                    model.layers[il].ffn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
-+            cb(cur, "ffn_norm", il);
-+
-+            cur = llm_build_ffn(ctx0, lctx, cur,
-+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-+                    NULL,
-+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = ggml_add(ctx0, cur, ffn_inp);
-+            cb(cur, "ffn_out", il);
-+
-+            cur = lctx.cvec.apply_to(ctx0, cur, il);
-+            cb(cur, "l_out", il);
-+
-+            // input for next layer
-+            inpL = cur;
-+        }
-+
-+        cur = inpL;
-+
-+        cur = llm_build_norm(ctx0, cur, hparams,
-+                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
-+        cb(cur, "result_norm", -1);
-+
-+        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-+        cb(cur, "result_output", -1);
-+
-+        ggml_build_forward_expand(gf, cur);
-+
-+        return gf;
-+    }
- };
- 
- static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16451,6 +16686,10 @@ static struct ggml_cgraph * llama_build_graph(
-             {
-                 result = llm.build_chameleon();
-             } break;
-+        case LLM_ARCH_SOLAR:
-+            {
-+                result = llm.build_solar();
-+            } break;
-         default:
-             GGML_ABORT("fatal error");
-     }
-@@ -19594,6 +19833,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_GRANITE:
-         case LLM_ARCH_GRANITE_MOE:
-         case LLM_ARCH_CHAMELEON:
-+        case LLM_ARCH_SOLAR:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
--- 
-2.39.3 (Apple Git-146)
-
diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh
index 4e914d0f..f00cbe84 100755
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -7,15 +7,9 @@ set -e
 mkdir -p dist
 
 for TARGETARCH in arm64 amd64; do
-    if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
-        echo "Building Go runner darwin $TARGETARCH"
-        rm -rf llama/build
-        GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
-    else
-        echo "Building C++ runner darwin $TARGETARCH"
-        rm -rf llm/build
-        GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    fi
+    echo "Building Go runner darwin $TARGETARCH"
+    rm -rf llama/build
+    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
     # These require Xcode v13 or older to target MacOS v11
     # If installed to an alternate location use the following to enable
     # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh
index ea747a3c..567eb7c7 100755
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -19,7 +19,7 @@ docker buildx build \
     ${LOAD_OR_PUSH} \
     --platform=${PLATFORM} \
     ${OLLAMA_COMMON_BUILD_ARGS} \
-    -f ${DOCKERFILE_DIR}Dockerfile \
+    -f Dockerfile \
     -t ${FINAL_IMAGE_REPO}:$VERSION \
     .
 
@@ -29,7 +29,7 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
         --platform=linux/amd64 \
         ${OLLAMA_COMMON_BUILD_ARGS} \
         --target runtime-rocm \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
         -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
         .
 fi
\ No newline at end of file
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 48fdd37b..894d9dd2 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -19,7 +19,7 @@ docker buildx build \
         --platform=${PLATFORM} \
         ${OLLAMA_COMMON_BUILD_ARGS} \
         --target dist \
-        -f ${DOCKERFILE_DIR}Dockerfile \
+        -f Dockerfile \
         .
 
 # buildx behavior changes for single vs. multiplatform
diff --git a/scripts/build_remote.py b/scripts/build_remote.py
deleted file mode 100755
index 2ab58ad7..00000000
--- a/scripts/build_remote.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python3
-import subprocess
-import sys
-from urllib.parse import urlparse
-from git import Repo
-
-# Helper script to be able to build on remote repos using git to push local changes
-# (e.g. particularly helpful to target a remote windows build system)
-#
-# Typical windows remote git config looks like this:
-#
-#[remote "windows-pa"]
-#        url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
-#        fetch = +refs/heads/*:refs/remotes/windows-pa/*
-#        uploadpack = powershell git upload-pack
-#        receivepack = powershell git receive-pack
-#
-
-# TODO - add argpare and make this more configurable 
-# - force flag becomes optional
-# - generate, build or test ...
-
-# Note: remote repo will need this run once:
-# git config --local receive.denyCurrentBranch updateInstead
-repo = Repo(".")
-
-# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
-# GoCmd = "/usr/local/go/bin/go" 
-GoCmd = "go" 
-
-if repo.is_dirty():
-    print("Tree is dirty.  Commit your changes before running this script")
-    sys.exit(1)
-
-if len(sys.argv) != 2:
-    print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
-    sys.exit(1)
-remote_name = sys.argv[1]
-
-remote = {r.name: r for r in repo.remotes}[remote_name]
-raw_url = list(remote.urls)[0]
-url = urlparse(raw_url)
-# Windows urls don't quite parse properly
-if url.scheme == "" and url.netloc == "":
-    url = urlparse("ssh://" + raw_url)
-print("URL: " + str(url))
-netloc = url.netloc.split(":")[0]
-path = url.path
-branch_name = repo.active_branch.name
-
-print("Force pushing content to remote...")
-# Use with care given the force push
-remote.push(force=True).raise_if_error()
-
-print("Ensuring correct branch checked out on remote via ssh...")
-subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
-
-
-# TODO - add some hardening to try to figure out how to set up the path properly
-# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
-# TODO - or consider paramiko maybe
-
-print("Running Windows Build Script")
-subprocess.check_call(['ssh', netloc, 'cd', path, ';', "powershell", "-ExecutionPolicy", "Bypass", "-File", "./scripts/build_windows.ps1"])
-
-# print("Building")
-# subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
-
-print("Copying built result")
-subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe",  './dist/'])
-
-print("Copying installer")
-subprocess.check_call(['scp', netloc +":"+ path + "/dist/Ollama Setup.exe",  './dist/'])
-
-
-
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index c29ba20e..9cf363f3 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -83,51 +83,8 @@ function buildOllama() {
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
         write-host "Building ollama runners"
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        if ($null -eq ${env:OLLAMA_NEW_RUNNERS}) {
-            # Start by skipping CUDA to build everything else
-            write-host "Building ollama runners"
-            powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
-
-            # Then skip everyhting else and build all the CUDA variants
-            foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
-                write-host "Building CUDA ${env:CUDA_LIB_DIR} runner"
-
-                if ($env:CUDA_LIB_DIR.Contains("v12")) {
-                    powershell -Command {
-                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                        $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-                        $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
-                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                        & go generate ./...
-                    }
-                } else {
-                    powershell -Command {
-                        $env:OLLAMA_SKIP_CUDA_GENERATE=""
-                        $env:OLLAMA_SKIP_STATIC_GENERATE="1"
-                        $env:OLLAMA_SKIP_CPU_GENERATE="1"
-                        $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
-                        $env:OLLAMA_SKIP_ROCM_GENERATE="1"
-                        $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
-                        $env:OLLAMA_CUSTOM_CUDA_DEFS=""
-                        $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
-                        $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
-                        & go generate ./...
-                    }
-                }
-                if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            }
-        } else {
-            & make -C llama -j 12
-            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        }
-        
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}    
+        & make -C llama -j 12
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
     }
@@ -172,7 +129,7 @@ function gatherDependencies() {
     } else {
         $depArch=$script:TARGET_ARCH
     }
-    if ($depArch -eq "amd64") {
+    if ($depArch -eq "x64") {
         cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\"
         cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\"
         cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\"
diff --git a/scripts/env.sh b/scripts/env.sh
index 0ccbac59..22b4ee4e 100644
--- a/scripts/env.sh
+++ b/scripts/env.sh
@@ -20,12 +20,6 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \
     --build-arg=CUSTOM_CPU_FLAGS \
     --build-arg=GPU_RUNNER_CPU_FLAGS \
     --build-arg=AMDGPU_TARGETS"
-OLLAMA_NEW_RUNNERS=${OLLAMA_NEW_RUNNERS:-""}
-if [ -n "${OLLAMA_NEW_RUNNERS}" ]; then
-    DOCKERFILE_DIR="./llama/"
-else
-    DOCKERFILE_DIR="./"
-fi
 
 echo "Building Ollama"
 echo "VERSION=$VERSION"