Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by isolating
llama.cpp in a separate process that we can shut down when idle and gracefully
restart if it runs into problems. It also serves as a first step toward running
multiple copies to support multiple models concurrently.
Daniel Hiltgen 2024-03-14 10:24:13 -07:00
parent 3b6a9154dd
commit 58d95cc9bd
35 changed files with 1416 additions and 1910 deletions
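The description above captures the mechanism: run llama.cpp as a child process, shut it down when the runner goes idle, and restart it if it exits unexpectedly. Below is a minimal Go sketch of that supervision pattern for orientation only; the names (`runServer`, the binary path, the `--model` flag, the idle timeout) are illustrative assumptions, not the actual code introduced by this commit.

```go
// Minimal sketch of the subprocess supervision described above, not the
// actual ollama implementation. runServer and the binary/flag names are
// hypothetical stand-ins.
package main

import (
	"context"
	"log/slog"
	"os/exec"
	"time"
)

// runServer launches the external llama.cpp server and restarts it whenever
// it exits unexpectedly, until ctx is cancelled (e.g. by an idle timer).
func runServer(ctx context.Context, bin string, args ...string) {
	for {
		cmd := exec.CommandContext(ctx, bin, args...)
		if err := cmd.Start(); err != nil {
			slog.Error("failed to start llama.cpp server", "error", err)
			return
		}
		err := cmd.Wait()
		if ctx.Err() != nil {
			// Deliberate shutdown (idle or application exit): do not restart.
			return
		}
		slog.Warn("llama.cpp server exited unexpectedly, restarting", "error", err)
		time.Sleep(time.Second) // simple backoff before restarting
	}
}

func main() {
	// Stand-in for an idle timeout that would normally be reset on each request.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	runServer(ctx, "./ollama_llama_server", "--model", "model.gguf")
}
```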


@ -56,10 +56,12 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
go generate -x ./... go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
name: "Windows Go Generate" name: "Windows Go Generate"
@ -69,7 +71,9 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build/**/lib/* path: |
llm/build/**/bin/*
llm/build/**/*.a
generate-cuda: generate-cuda:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@ -100,7 +104,7 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: cuda-${{ matrix.cuda-version }}-libraries name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/* path: llm/build/**/bin/*
generate-rocm: generate-rocm:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@ -131,7 +135,7 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: rocm-${{ matrix.rocm-version }}-libraries name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/llama.cpp/build/**/lib/* path: llm/build/**/lib/*
# ROCm generation step # ROCm generation step
generate-windows-rocm: generate-windows-rocm:
@ -244,17 +248,17 @@ jobs:
esac >>$GITHUB_ENV esac >>$GITHUB_ENV
shell: bash shell: bash
- run: | - run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ mkdir -p llm/build/linux/$ARCH/stub/bin/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so touch llm/build/linux/$ARCH/stub/bin/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }} if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ mkdir -p llm/build/darwin/$ARCH/stub/bin/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
touch llm/llama.cpp/ggml-metal.metal touch llm/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }} if: ${{ startsWith(matrix.os, 'macos-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3 - uses: golangci/golangci-lint-action@v3
test: test:
@ -271,6 +275,7 @@ jobs:
env: env:
GOARCH: ${{ matrix.arch }} GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1' CGO_ENABLED: '1'
OLLAMA_CPU_TARGET: "static"
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
@ -287,18 +292,19 @@ jobs:
esac >>$GITHUB_ENV esac >>$GITHUB_ENV
shell: bash shell: bash
- run: | - run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ mkdir -p llm/build/linux/$ARCH/stub/bin/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so touch llm/build/linux/$ARCH/stub/bin/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }} if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ mkdir -p llm/build/darwin/$ARCH/stub/bin/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
touch llm/llama.cpp/ggml-metal.metal touch llm/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }} if: ${{ startsWith(matrix.os, 'macos-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
- run: go generate ./...
- run: go build - run: go build
- run: go test -v ./... - run: go test -v ./...
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4

.gitignore vendored

@ -11,3 +11,4 @@ ggml-metal.metal
.idea .idea
test_data test_data
*.crt *.crt
llm/build


@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64 FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
ARG CMAKE_VERSION ARG CMAKE_VERSION
ARG GOLANG_VERSION ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh / COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh # Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1 ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
@ -101,8 +108,8 @@ ENV CGO_ENABLED 1
ARG GOLANG_VERSION ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
RUN go build -trimpath . RUN go build -trimpath .


@ -9,6 +9,7 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"syscall"
"time" "time"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
io.Copy(logFile, stderr) //nolint:errcheck io.Copy(logFile, stderr) //nolint:errcheck
}() }()
// Re-wire context done behavior to attempt a graceful shutdown of the server
cmd.Cancel = func() error {
if cmd.Process != nil {
cmd.Process.Signal(os.Interrupt) //nolint:errcheck
tick := time.NewTicker(10 * time.Millisecond)
defer tick.Stop()
timeout := time.After(5 * time.Second)
for {
select {
case <-tick.C:
// OS-agnostic "is it still running" check
if proc, err := os.FindProcess(int(cmd.Process.Pid)); err != nil || errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
return nil //nolint:nilerr
}
case <-timeout:
slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
cmd.Process.Kill() //nolint:errcheck
}
}
}
}
return nil
}
// run the command and wait for it to finish // run the command and wait for it to finish
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
return done, fmt.Errorf("failed to start server %w", err) return done, fmt.Errorf("failed to start server %w", err)
@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code)) slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
done <- code done <- code
return return
default: default:


@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) {
return return
} }
updateLibPath(libDir)
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" { if gfxOverride == "" {
supported, err := GetSupportedGFX(libDir) supported, err := GetSupportedGFX(libDir)
@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) {
} }
} }
func updateLibPath(libDir string) {
ldPaths := []string{}
if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
ldPaths = strings.Split(val, ":")
}
for _, d := range ldPaths {
if d == libDir {
return
}
}
val := strings.Join(append(ldPaths, libDir), ":")
slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
os.Setenv("LD_LIBRARY_PATH", val)
}
// Walk the sysfs nodes for the available GPUs and gather information from them // Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map // skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {


@ -11,6 +11,7 @@ import (
"strings" "strings"
"sync" "sync"
"syscall" "syscall"
"time"
) )
var ( var (
@ -83,11 +84,16 @@ func Cleanup() {
tmpDir := filepath.Clean(filepath.Join(payloadsDir, "..")) tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir) slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir) err := os.RemoveAll(tmpDir)
if err != nil {
// On Windows, if we remove the directory too quickly, llama.dll may still be in use and the removal fails, so retry once after a short delay
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil { if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err) slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
} }
} }
} }
}
func UpdatePath(dir string) { func UpdatePath(dir string) {
if runtime.GOOS == "windows" { if runtime.GOOS == "windows" {


@ -1,142 +0,0 @@
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}


@ -1,388 +0,0 @@
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
"unsafe"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options *api.Options
}
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
var resp C.ext_server_resp_t
resp.msg_len = len
bytes := make([]byte, len)
resp.msg = (*C.char)(C.CBytes(bytes))
return resp
}
func freeExtServerResp(resp C.ext_server_resp_t) {
if resp.msg_len == 0 {
return
}
C.free(unsafe.Pointer(resp.msg))
}
func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
gpu.UpdatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm := dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
// Always use the value encoded in the model
sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
if opts.UseNUMA {
sparams.numa = C.int(1)
} else {
sparams.numa = C.int(0)
}
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter))
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
la.next = nil
if i == 0 {
sparams.lora_adapters = la
} else {
tmp := sparams.lora_adapters
for ; tmp.next != nil; tmp = tmp.next {
}
tmp.next = la
}
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
sparams.mmproj = C.CString(projectors[0])
defer C.free(unsafe.Pointer(sparams.mmproj))
} else {
sparams.mmproj = nil
}
sparams.n_threads = C.uint(opts.NumThread)
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return &llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
}
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
"n_predict": predict.Options.NumPredict,
"n_keep": predict.Options.NumKeep,
"temperature": predict.Options.Temperature,
"top_k": predict.Options.TopK,
"top_p": predict.Options.TopP,
"tfs_z": predict.Options.TFSZ,
"typical_p": predict.Options.TypicalP,
"repeat_last_n": predict.Options.RepeatLastN,
"repeat_penalty": predict.Options.RepeatPenalty,
"presence_penalty": predict.Options.PresencePenalty,
"frequency_penalty": predict.Options.FrequencyPenalty,
"mirostat": predict.Options.Mirostat,
"mirostat_tau": predict.Options.MirostatTau,
"mirostat_eta": predict.Options.MirostatEta,
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"cache_prompt": true,
}
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %w", err)
}
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
}
}
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
retryNeeded = true
// task will already be canceled
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop || bool(result.stop) {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
}
return encoded.Tokens, err
}
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2)
}
return decoded.Content, err
}
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}


@ -1,74 +0,0 @@
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif


@ -1,21 +1,14 @@
set(TARGET ext_server) set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32) if (WIN32)
add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
else()
add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
endif() endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()


@ -1,18 +0,0 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```


@ -1,377 +0,0 @@
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = (ggml_numa_strategy)sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}


@ -1,95 +0,0 @@
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg;
} ext_server_resp_t;
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decodewra
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
int numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters;
char *mmproj;
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
typedef struct ext_server_task_result {
int id;
bool stop;
bool error;
char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = NULL
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);
// Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
// 0
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY


@ -2768,7 +2768,7 @@ inline void signal_handler(int signal) {
shutdown_handler(signal); shutdown_handler(signal);
} }
int _main(int argc, char **argv) int main(int argc, char **argv)
{ {
#if SERVER_VERBOSE != 1 #if SERVER_VERBOSE != 1
log_disable(); log_disable();


@ -14,7 +14,7 @@ init_vars() {
LLAMACPP_DIR=../llama.cpp LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS="" CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server" CMAKE_TARGETS="--target ollama_llama_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else else
@ -81,26 +81,23 @@ apply_patches() {
build() { build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
ls ${BUILD_DIR}
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
} }
compress_libs() { compress() {
echo "Compressing payloads to reduce overall binary size..." echo "Compressing payloads to reduce overall binary size..."
pids="" pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz rm -rf ${BUILD_DIR}/bin/*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do for f in ${BUILD_DIR}/bin/* ; do
gzip -n --best -f ${lib} & gzip -n --best -f ${f} &
pids+=" $!" pids+=" $!"
done done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
gzip -n --best -f ${f} &
pids+=" $!"
done
fi
echo echo
for pid in ${pids}; do for pid in ${pids}; do
wait $pid wait $pid


@ -18,21 +18,31 @@ sign() {
fi fi
} }
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin" COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
case "${GOARCH}" in case "${GOARCH}" in
"amd64") "amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
# #
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
# #
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
# #
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@ -40,11 +50,11 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
# #
# ~2013 CPU Dynamic library # ~2013 CPU Dynamic library
@ -52,20 +62,30 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
;; ;;
"arm64") "arm64")
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
;; ;;
*) *)
echo "GOARCH must be set" echo "GOARCH must be set"
@ -75,3 +95,4 @@ case "${GOARCH}" in
esac esac
cleanup cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"


@ -57,16 +57,31 @@ init_vars
git_module_setup git_module_setup
apply_patches apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
# Users building from source can tune the exact flags we pass to cmake for configuring # Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default. # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building custom CPU" echo "Building custom CPU"
build build
compress_libs compress
else else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@ -83,11 +98,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
# #
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
compress_libs compress
fi fi
if [ "${ARCH}" == "x86_64" ]; then if [ "${ARCH}" == "x86_64" ]; then
@ -101,10 +117,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
compress_libs compress
fi fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
@ -114,10 +130,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
build build
compress_libs compress
fi fi
fi fi
fi fi
@ -157,7 +173,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off" ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}" CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build build
@ -165,20 +181,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
# #
# TODO - in the future we may shift to packaging these separately and conditionally # TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script. # downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )" DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/" cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/" cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/" cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
else else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/" cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
fi fi
done done
compress_libs compress
fi fi
@ -201,23 +217,24 @@ if [ -d "${ROCM_PATH}" ]; then
fi fi
init_vars init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build build
# Record the ROCM dependencies # Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt" rm -f "${BUILD_DIR}/bin/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt" touch "${BUILD_DIR}/bin/deps.txt"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt" echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
done done
# bomb out if for some reason we didn't get a few deps # bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt" cat "${BUILD_DIR}/bin/deps.txt"
echo "ERROR: deps file short" echo "ERROR: deps file short"
exit 1 exit 1
fi fi
compress_libs compress
fi fi
cleanup cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View file

@ -33,7 +33,7 @@ function init_vars {
"-DBUILD_SHARED_LIBS=on", "-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off" "-DLLAMA_NATIVE=off"
) )
$script:cmakeTargets = @("ext_server") $script:cmakeTargets = @("ollama_llama_server")
$script:ARCH = "amd64" # arm not yet supported. $script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") { if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@ -97,16 +97,14 @@ function apply_patches {
} }
# Checkout each file # Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) { foreach ($file in $filePaths) {
git checkout $file git -C "${script:llamacppDir}" checkout $file
} }
} }
# Apply each patch # Apply each patch
foreach ($patch in $patches) { foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir} git -C "${script:llamacppDir}" apply $patch.FullName
git apply $patch.FullName
} }
} }
@ -115,41 +113,41 @@ function build {
& cmake --version & cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })" write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} # Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
function install { mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib" remove-item "${script:buildDir}/bin/${script:config}"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
} }
} }
function sign { function sign {
if ("${env:KEY_CONTAINER}") { if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll" write-host "Signing ${script:buildDir}/bin/*.exe ${script:buildDir}/bin/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){ foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
& "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
} }
function compress_libs { function compress {
if ($script:GZIP -eq $null) { if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files" write-host "gzip not installed, not compressing files"
return return
} }
write-host "Compressing binaries..."
$binaries = dir "${script:buildDir}/bin/*.exe"
foreach ($file in $binaries) {
& "$script:GZIP" --best -f $file
}
write-host "Compressing dlls..." write-host "Compressing dlls..."
$libs = dir "${script:buildDir}/lib/*.dll" $dlls = dir "${script:buildDir}/bin/*.dll"
foreach ($file in $libs) { foreach ($file in $dlls) {
& "$script:GZIP" --best -f $file & "$script:GZIP" --best -f $file
} }
} }
@ -164,14 +162,11 @@ function cleanup {
} }
# Checkout each file # Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) { foreach ($file in $filePaths) {
git checkout $file git -C "${script:llamacppDir}" checkout $file
} }
git -C "${script:llamacppDir}" checkout CMakeLists.txt
} }
Set-Location "${script:llamacppDir}/"
git checkout CMakeLists.txt
} }
init_vars init_vars
@ -179,7 +174,6 @@ git_module_setup
apply_patches apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
@ -187,32 +181,46 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) { if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
# GCC build for direct linking into the Go binary
init_vars
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
# remaining llama.cpp builds use MSVC
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" $script:buildDir="../build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU" write-host "Building LCD CPU"
build build
install
sign sign
compress_libs compress
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU" write-host "Building AVX CPU"
build build
install
sign sign
compress_libs compress
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU" write-host "Building AVX2 CPU"
build build
install
sign sign
compress_libs compress
} else { } else {
write-host "Skipping CPU generation step as requested" write-host "Skipping CPU generation step as requested"
} }
@ -225,13 +233,11 @@ if ($null -ne $script:CUDA_LIB_DIR) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
} }
init_vars init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
write-host "Building CUDA"
build build
install
sign sign
compress_libs compress
} }
if ($null -ne $env:HIP_PATH) { if ($null -ne $env:HIP_PATH) {
@ -241,7 +247,7 @@ if ($null -ne $env:HIP_PATH) {
} }
init_vars init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @( $script:cmakeDefs += @(
"-G", "Ninja", "-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_C_COMPILER=clang.exe",
@ -264,13 +270,13 @@ if ($null -ne $env:HIP_PATH) {
build build
# Ninja doesn't prefix with config name # Ninja doesn't prefix with config name
${script:config}="" ${script:config}=""
install
if ($null -ne $script:DUMPBIN) { if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
} }
sign sign
compress_libs compress
} }
cleanup cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})" write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"

View file

@ -1,3 +1,3 @@
package generate package generate
//go:generate sh ./gen_darwin.sh //go:generate bash ./gen_darwin.sh

View file

@ -1,100 +0,0 @@
package llm
import (
_ "embed"
"fmt"
"time"
"github.com/ollama/ollama/api"
)
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
type prediction struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
}
const maxRetries = 3
type PredictOpts struct {
Prompt string
Format string
Images []ImageData
Options api.Options
}
type PredictResult struct {
Content string
Done bool
PromptEvalCount int
PromptEvalDuration time.Duration
EvalCount int
EvalDuration time.Duration
}
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse struct {
Content string `json:"content"`
}
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}

View file

@ -1,183 +1,15 @@
package llm package llm
import ( // #cgo CFLAGS: -Illama.cpp
"context" // #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
"fmt" // #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
"log/slog" // #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
"os" // #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
"slices" // #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
"strings" // #include "llama.h"
import "C"
"github.com/ollama/ollama/api" // SystemInfo is an unused example of calling llama.cpp functions using CGo
"github.com/ollama/ollama/format" func SystemInfo() string {
"github.com/ollama/ollama/gpu" return C.GoString(C.llama_print_system_info())
)
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
return newLlmServer(info, model, adapters, projectors, opts)
}
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
}
return nil, err2
} }
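A minimal sketch (illustrative, assuming `go generate ./...` has already produced the static libraries referenced by the cgo directives above) showing the new direct cgo call:

package main

import (
	"fmt"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Prints llama.cpp's compiled feature flags (AVX, Metal, CUDA, ...)
	fmt.Println(llm.SystemInfo())
}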

View file

@ -4,5 +4,5 @@ import (
"embed" "embed"
) )
//go:embed llama.cpp/build/linux/*/*/lib/* //go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS var libEmbed embed.FS

View file

@ -4,5 +4,5 @@ import (
"embed" "embed"
) )
//go:embed llama.cpp/build/windows/*/*/lib/*.dll* //go:embed build/darwin/arm64/*/bin/*
var libEmbed embed.FS var libEmbed embed.FS

llm/llm_linux.go Normal file
View file

@ -0,0 +1,6 @@
package llm
import "embed"
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS

llm/llm_windows.go Normal file
View file

@ -0,0 +1,6 @@
package llm
import "embed"
//go:embed build/windows/*/*/bin/*
var libEmbed embed.FS

llm/payload.go Normal file
View file

@ -0,0 +1,211 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
var variants []string
for v := range availableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
}
return servers
}
// serversForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. Assumes Init() has been called.
// TODO - switch to metadata based mapping
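// Example (illustrative): with extracted variants {"cpu", "cpu_avx2", "cuda_v11", "rocm_v5"}
// and a GPU reporting Library "cuda" / Variant "v11", the result is
// ["cuda_v11", "cpu_avx2"] (assuming GetCPUVariant() returns "avx2"); a CPU-only
// host simply gets ["cpu"].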
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
return servers
}
// extractFiles extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
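// e.g. (illustrative) an embedded build/linux/x86_64/cpu_avx2/bin/ollama_llama_server.gz
// is decompressed and written to <targetDir>/cpu_avx2/ollama_llama_server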
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}

View file

@ -1,233 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

View file

@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

View file

@ -1,8 +0,0 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS

View file

@ -1,58 +0,0 @@
package llm
import (
"testing"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}

llm/server.go Normal file
View file

@ -0,0 +1,854 @@
package llm
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
// LlamaServer is an instance of the llama.cpp server
type LlamaServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options *api.Options
}
var cpuOnlyFamilies = []string{
"mamba",
}
func NewLlamaServer(model string, adapters, projectors []string, opts *api.Options) (*LlamaServer, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
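// Illustrative numbers for a hypothetical 7B-class model (n_ctx=2048, n_layer=32,
// n_embd=4096, n_head=n_head_kv=32): kv = 2*2*2048*32*4096/32*32 bytes ≈ 1 GiB of
// fp16 cache, and the estimated graph overhead is kv/6 ≈ 170 MiB (GQA factor 1).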
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
if len(adapters) > 1 {
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
availableServers := availableServers()
servers := serversForGpu(info)
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
servers = []string{demandLib}
}
}
if len(servers) == 0 {
return nil, fmt.Errorf("no servers found for %v", info)
}
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding",
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--log-format", "json")
} else {
params = append(params, "--log-disable")
}
if opts.NumGPU > 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 {
params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
}
if opts.RopeFrequencyBase > 0 {
params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase))
}
if opts.RopeFrequencyScale > 0 {
params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale))
}
if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0])
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
params = append(params, "--mmproj", projectors[0])
}
if opts.NumThread > 0 {
params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
}
if !opts.F16KV {
params = append(params, "--memory-f32")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
// Loop through potential servers
var finalErr error
for i := 0; i < len(servers); i++ {
dir := availableServers[servers[i]]
// Find an available port, retrying on each iteration in case the failure was a port conflict race
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed ", "error", err)
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := append(params, "--port", strconv.Itoa(port))
pathEnv := "LD_LIBRARY_PATH"
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// append the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
}
server := filepath.Join(dir, "ollama_llama_server")
if runtime.GOOS == "windows" {
server = server + ".exe"
}
s := &LlamaServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
}
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
slog.Debug(libEnv)
s.cmd.Env = append(os.Environ(), libEnv)
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
slog.Info("starting llama server", "cmd", s.cmd.String())
if err = s.cmd.Start(); err != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
finalErr = err
continue
}
// reap subprocess when it exits
go func() {
// Exit status managed via getServerStatus
_ = s.cmd.Wait()
}()
if err = s.waitUntilRunning(); err != nil {
slog.Error("error starting llama server", "server", servers[i], "error", err)
s.Close()
finalErr = err
continue
}
return s, nil
}
slog.Error("unable to load any llama server", "error", finalErr)
return nil, finalErr
}
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
type ServerStatus int
const ( // iota is reset to 0
ServerStatusReady ServerStatus = iota
ServerStatusNoSlotsAvailable
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusError
)
type ServerStatusResp struct {
Status string `json:"status"`
SlotsIdle int `json:"slots_idle"`
SlotsProcessing int `json:"slots_processing"`
Error string `json:"error"`
}
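// Illustrative healthy response from the runner's GET /health endpoint:
// {"status": "ok", "slots_idle": 1, "slots_processing": 0}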
func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
// Fail fast if its exited
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil)
if err != nil {
return ServerStatusError, fmt.Errorf("error creating GET request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, fmt.Errorf("server not responding")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return ServerStatusError, fmt.Errorf("read health request: %w", err)
}
var status ServerStatusResp
if err := json.Unmarshal(body, &status); err != nil {
return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err)
}
switch status.Status {
case "ok":
return ServerStatusReady, nil
case "no slot available":
return ServerStatusNoSlotsAvailable, nil
case "loading model":
return ServerStatusLoadingModel, nil
default:
return ServerStatusError, fmt.Errorf("server error: %+v", status)
}
}
func (s *LlamaServer) Ping(ctx context.Context) error {
_, err := s.getServerStatus(ctx)
if err != nil {
slog.Debug("server unhealthy", "error", err)
return err
}
return nil
}
func (s *LlamaServer) waitUntilRunning() error {
start := time.Now()
expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load
ticker := time.NewTicker(50 * time.Millisecond)
defer ticker.Stop()
slog.Info("waiting for llama runner to start responding")
var lastStatus ServerStatus = -1
for {
select {
case err := <-s.done:
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
case <-ticker.C:
if time.Now().After(expiresAt) {
// timeout
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
}
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
status, err := s.getServerStatus(ctx)
if err != nil && lastStatus != status {
slog.Debug("server not yet available", "error", err)
lastStatus = status
continue
}
switch status {
case ServerStatusLoadingModel:
// TODO - this state never seems to happen with the current server.cpp code (bug?)
// it doesn't respond to the health endpoint until after the model is loaded
slog.Debug("loading model")
case ServerStatusReady:
slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
return nil
}
}
}
}
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
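// For example, the grammar above accepts {"name": "llama", "layers": 32} but rejects
// any top-level value that is not an object (root ::= object).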
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
type completion struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
}
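// The runner streams newline-delimited "data: " events; an illustrative chunk:
// data: {"content":" world","stop":false}
// The final chunk sets "stop": true and carries the timing fields above.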
type CompletionRequest struct {
Prompt string
Format string
Images []ImageData
Options api.Options
}
type CompletionResponse struct {
Content string
Done bool
PromptEvalCount int
PromptEvalDuration time.Duration
EvalCount int
EvalDuration time.Duration
}
func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
request := map[string]any{
"prompt": req.Prompt,
"stream": true,
"n_predict": req.Options.NumPredict,
"n_keep": req.Options.NumKeep,
"main_gpu": req.Options.MainGPU,
"temperature": req.Options.Temperature,
"top_k": req.Options.TopK,
"top_p": req.Options.TopP,
"tfs_z": req.Options.TFSZ,
"typical_p": req.Options.TypicalP,
"repeat_last_n": req.Options.RepeatLastN,
"repeat_penalty": req.Options.RepeatPenalty,
"presence_penalty": req.Options.PresencePenalty,
"frequency_penalty": req.Options.FrequencyPenalty,
"mirostat": req.Options.Mirostat,
"mirostat_tau": req.Options.MirostatTau,
"mirostat_eta": req.Options.MirostatEta,
"penalize_nl": req.Options.PenalizeNewline,
"seed": req.Options.Seed,
"stop": req.Options.Stop,
"image_data": req.Images,
"cache_prompt": true,
}
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
return fmt.Errorf("unexpected server status: %d", status)
}
if req.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
scanner := bufio.NewScanner(resp.Body)
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
// try again on slot unavailable
if bytes.Contains(line, []byte("slot unavailable")) {
retryNeeded = true
break
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return ctx.Err()
}
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
})
}
if c.Stop {
fn(CompletionResponse{
Done: true,
PromptEvalCount: c.Timings.PromptN,
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
EvalCount: c.Timings.PredictedN,
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
})
return nil
}
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
s.Close()
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(EmbeddingRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("error creating embed request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do embedding request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("error reading embed response: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var embedding EmbeddingResponse
if err := json.Unmarshal(body, &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
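// Illustrative round trip (token IDs are placeholders):
// POST /tokenize {"content": "hello world"} -> {"tokens": [123, 456]}
// POST /detokenize {"tokens": [123, 456]} -> {"content": "hello world"}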
func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(TokenizeRequest{Content: content})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("encode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do encode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read encode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var encoded TokenizeResponse
if err := json.Unmarshal(body, &encoded); err != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err)
}
return encoded.Tokens, nil
}
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse struct {
Content string `json:"content"`
}
func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady {
return "", fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return "", fmt.Errorf("decode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("do decode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read decode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm decode error: %s", body)
return "", fmt.Errorf("%s", body)
}
var decoded DetokenizeResponse
if err := json.Unmarshal(body, &decoded); err != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err)
}
return decoded.Content, nil
}
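As with the embedding helper, a short round-trip sketch for the two token endpoints (illustrative only, same assumptions about a ready *LlamaServer `s`):

// Hypothetical round trip: prompt -> token ids -> text.
ctx := context.Background()
tokens, err := s.Tokenize(ctx, "hello world")
if err != nil {
	log.Fatal(err)
}
text, err := s.Detokenize(ctx, tokens)
if err != nil {
	log.Fatal(err)
}
log.Printf("%d tokens decode back to %q", len(tokens), text)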
func (s *LlamaServer) Close() error {
if s.cmd != nil {
slog.Debug("stopping llama server")
return s.cmd.Process.Kill()
}
return nil
}
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

llm/status.go Normal file
View file

@@ -0,0 +1,42 @@
package llm
import (
"bytes"
"os"
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
LastErrMsg string
out *os.File
}
func NewStatusWriter(out *os.File) *StatusWriter {
return &StatusWriter{
out: out,
}
}
// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
var errorPrefixes = []string{
"error:",
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
for _, prefix := range errorPrefixes {
if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
errMsg = prefix + string(bytes.TrimSpace(after))
}
}
if errMsg != "" {
w.LastErrMsg = errMsg
}
return w.out.Write(b)
}
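To make the capture behavior concrete, a small illustrative snippet (an assumption-laden sketch, not part of the diff; it presumes it sits in package llm next to StatusWriter and that fmt and os are imported; the input line is made up):

// Hypothetical example: a CUDA failure line from the runner's stderr is
// passed through to the underlying file while the matching tail is remembered.
w := NewStatusWriter(os.Stderr)
_, _ = w.Write([]byte("ggml_cuda_init: CUDA error: out of memory\n"))
fmt.Println(w.LastErrMsg) // prints: CUDA error: out of memory

Because the loop keeps overwriting errMsg, the last entry in errorPrefixes that matches a given write wins.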

View file

@@ -1,15 +0,0 @@
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}

View file

@@ -56,12 +56,13 @@ func init() {
 var loaded struct {
 	mu sync.Mutex
-	runner llm.LLM
+	llama *llm.LlamaServer
-	expireAt time.Time
 	expireTimer *time.Timer
-	*Model
+	model string
+	adapters []string
+	projectors []string
 	*api.Options
 }
@@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
-	needLoad := loaded.runner == nil || // is there a model loaded?
-		loaded.ModelPath != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
+	ctx, cancel := context.WithTimeout(c, 10*time.Second)
+	defer cancel()
+	needLoad := loaded.llama == nil || // is there a model loaded?
+		loaded.model != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the projectors changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
+		loaded.llama.Ping(ctx) != nil
 	if needLoad {
-		if loaded.runner != nil {
+		if loaded.llama != nil {
 			slog.Info("changing loaded model")
-			loaded.runner.Close()
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama.Close()
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		}
-		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
 			return err
 		}
-		loaded.Model = model
-		loaded.runner = llmRunner
+		loaded.model = model.ModelPath
+		loaded.adapters = model.AdapterPaths
+		loaded.projectors = model.ProjectorPaths
+		loaded.llama = llama
 		loaded.Options = opts
 	}
-	loaded.expireAt = time.Now().Add(sessionDuration)
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()
-			if time.Now().Before(loaded.expireAt) {
-				return
-			}
-			if loaded.runner != nil {
-				loaded.runner.Close()
-			}
-			loaded.runner = nil
-			loaded.Model = nil
+			if loaded.llama != nil {
+				loaded.llama.Close()
+			}
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		})
 	}
@@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) {
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -286,9 +292,8 @@
 	go func() {
 		defer close(ch)
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 			// Build up the full response
@@ -322,7 +327,7 @@
 			}
 			// TODO (jmorganca): encode() should not strip special tokens
-			tokens, err := loaded.runner.Encode(c.Request.Context(), p)
+			tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
 			if err != nil {
 				ch <- gin.H{"error": err.Error()}
 				return
@@ -344,13 +349,13 @@
 		}
 		// Start prediction
-		predictReq := llm.PredictOpts{
+		req := llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}
-	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		if loaded.runner != nil {
-			loaded.runner.Close()
+		if loaded.llama != nil {
+			loaded.llama.Close()
 		}
 		gpu.Cleanup()
 		os.Exit(0)
@@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.runner.Encode(ctx, s)
+		return loaded.llama.Tokenize(ctx, s)
 	}
 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 			resp := api.ChatResponse{
@@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}
-		// Start prediction
-		predictReq := llm.PredictOpts{
+		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		}, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
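The hunks above also drop the separate expireAt bookkeeping: keeping a model resident now relies entirely on resetting the idle timer after each response. A minimal sketch of that idiom with made-up names (touch, unload, idleTimeout), not the server's actual fields:

// Illustrative keep-alive pattern: one timer, re-armed on every request,
// fires the unload callback once the process has been idle long enough.
var (
	mu    sync.Mutex
	timer *time.Timer
)

func touch(idleTimeout time.Duration, unload func()) {
	mu.Lock()
	defer mu.Unlock()
	if timer == nil {
		timer = time.AfterFunc(idleTimeout, unload)
		return
	}
	timer.Reset(idleTimeout)
}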

View file

@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ func Test_Routes(t *testing.T) {
 	}
 }
-type MockLLM struct {
-	encoding []int
-}
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-func (llm *MockLLM) Close() {
-	// do nothing
-}