Adapted rocm support to cgo based llama.cpp

commit 35934b2e05 (parent f8ef4439e9)
Author: Daniel Hiltgen
Date:   2023-11-29 11:00:37 -08:00

37 changed files with 1688 additions and 658 deletions

@@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
 COPY . .
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... \
-    && /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build .

 FROM ubuntu:22.04
 RUN apt-get update && apt-get install -y ca-certificates
@@ -27,5 +27,3 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]

@@ -1,19 +1,44 @@
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-amd64
+# ROCm only supports amd64
+ARG ROCM_VERSION=5.7
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+ENV ROCM_PATH=/opt/rocm

-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
+# Ubuntu 22.04 arm64 dependencies
+FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr

 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG CLBLAST_VER=1.6.1
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install

 # install go
 ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
@@ -26,6 +51,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... && \
-    /usr/local/go/bin/go build -tags cuda .
+ENV CGO_CFLAGS=${CGO_CFLAGS}
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .

@@ -185,8 +185,6 @@ ollama list

 ## Building

-### Generic (CPU)
-
 Install `cmake` and `go`:

 ```
@@ -202,32 +200,36 @@ Then build the binary:
 go build .
 ```

-### CUDA (NVIDIA)
+### Linux/Windows CUDA (NVIDIA)

 *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages.
+Note: at present, Ollama is optimized for GPU usage on linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU.
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.

 Then generate dependencies:

 ```
-go generate -tags cuda ./...
+go generate ./...
 ```

 Then build the binary:

 ```
-go build -tags cuda .
+go build .
 ```

-### ROCm (AMD)
+### Linux ROCm (AMD)

 *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

 Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.

 Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:

 ```
-CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./...
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
 ```

 Then build the binary:

 ```
-go build -tags rocm
+go build .
 ```

+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
+
 ### Running local builds

 Next, start the server:

gpu/gpu.go (new file, 119 lines added)

@ -0,0 +1,119 @@
//go:build linux || windows
package gpu
/*
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// Note: gpuMutex must already be held
func initGPUHandles() {
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil}
var resp C.cuda_init_resp_t
C.cuda_init(&resp)
if resp.err != nil {
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
var resp C.rocm_init_resp_t
C.rocm_init(&resp)
if resp.err != nil {
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
log.Printf("Radeon GPU detected")
rocm := resp.rh
gpuHandles.rocm = &rocm
}
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
}
}
func GetGPUInfo() GpuInfo {
// TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
if gpuHandles == nil {
initGPUHandles()
}
var memInfo C.mem_info_t
var resp GpuInfo
if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
resp.Driver = "CUDA"
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
resp.Driver = "ROCM"
} else {
C.cpu_check_ram(&memInfo)
resp.Driver = "CPU"
}
if memInfo.err != nil {
log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
}
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
}
func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
return int64(gpuInfo.FreeMemory), nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
info := GetGPUInfo()
if info.Driver == "CPU" {
return 0
}
/*
Calculate bytes per layer: this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram;
to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := uint64(fileSizeBytes / numLayer)
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
// TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
// if int64(layers) < numLayer {
// log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
// return 0
// }
log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
return layers
}
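
To make the offload heuristic in `NumGPU` concrete, here is a small standalone sketch of the same arithmetic. It is not part of the commit, and the model size, layer count, and free-VRAM figures are made-up example values:

```
package main

import "fmt"

// estimateLayers mirrors the heuristic in gpu.NumGPU above: bytes per layer is
// roughly the model file size divided by the layer count, and only ~75% of the
// layers that would fit in free VRAM are offloaded, leaving headroom so the GPU
// does not OOM once the KV cache is allocated.
func estimateLayers(numLayer, fileSizeBytes, freeVRAM int64) int {
	bytesPerLayer := uint64(fileSizeBytes / numLayer)
	return int(uint64(freeVRAM)/bytesPerLayer) * 3 / 4
}

func main() {
	// Hypothetical 7B Q4 model: ~3.8 GiB file, 32 layers, 6 GiB of free VRAM.
	layers := estimateLayers(32, 3800*1024*1024, 6*1024*1024*1024)
	fmt.Printf("would offload up to %d of 32 layers\n", layers)
}
```

With roughly 3.8 GiB of weights split over 32 layers and 6 GiB of free VRAM, the 75% rule yields 38, so all 32 layers of this hypothetical model would be offloaded.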

@@ -1,7 +1,8 @@
 //go:build darwin

-package llm
+package gpu

+import "C"
 import (
 	"github.com/jmorganca/ollama/api"
 )
@@ -9,11 +10,25 @@ import (
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	// TODO - assume metal, and return free memory?
-	return 0, errNvidiaSMI
+	return 0, nil
 }

+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+	return GpuInfo{
+		Driver:      "METAL",
+		TotalMemory: 0,
+		FreeMemory:  0,
+	}
+}
+
 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	// default to enable metal on macOS
 	return 1
 }
+
+func nativeInit() error {
+	return nil
+}

gpu/gpu_info.h (new file, 49 lines added)

@ -0,0 +1,49 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mem_info {
uint64_t total;
uint64_t free;
char *err; // If non-null, caller is responsible for freeing
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

gpu/gpu_info_cpu.c (new file, 42 lines added)

@ -0,0 +1,42 @@
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
resp->err = strdup(LOAD_ERR());
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif

gpu/gpu_info_cuda.c (new file, 110 lines added)

@ -0,0 +1,110 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(cuda_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
if (!resp->ch.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
return;
}
ret = (*h.initFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
ret = (*h.shutdownFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
#endif // __APPLE__

gpu/gpu_info_cuda.h (new file, 35 lines added)

@ -0,0 +1,35 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
} cuda_handle_t;
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
void cuda_init(cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__

gpu/gpu_info_rocm.c (new file, 111 lines added)

@ -0,0 +1,111 @@
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
#ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(rocm_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
if (!resp->rh.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
ret = (*h.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
(*h.shutdownFn)();
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
}
#endif // __APPLE__

gpu/gpu_info_rocm.h (new file, 36 lines added)

@ -0,0 +1,36 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
void rocm_init(rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

gpu/gpu_test.go (new file, 26 lines added)

@ -0,0 +1,26 @@
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver)
switch runtime.GOOS {
case "darwin":
// TODO - remove this once MacOS returns some size for CPU
return
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

gpu/types.go (new file, 10 lines added)

@ -0,0 +1,10 @@
package gpu
// Beginning of an `ollama info` command
type GpuInfo struct {
Driver string `json:"driver,omitempty"`
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}
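
Since the struct carries JSON tags toward the future `ollama info` command mentioned in the comment, here is a minimal, purely illustrative sketch of how a caller might render it. The type is duplicated only so the snippet compiles on its own, and the memory values are made up:

```
package main

import (
	"encoding/json"
	"fmt"
)

// GpuInfo is copied from gpu/types.go above so this illustration is self-contained.
type GpuInfo struct {
	Driver      string `json:"driver,omitempty"`
	TotalMemory uint64 `json:"total_memory,omitempty"`
	FreeMemory  uint64 `json:"free_memory,omitempty"`
}

func main() {
	// In the real package this value would come from gpu.GetGPUInfo().
	info := GpuInfo{Driver: "CUDA", TotalMemory: 8 << 30, FreeMemory: 6 << 30}
	out, err := json.MarshalIndent(info, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```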

@ -1,67 +0,0 @@
//go:build cuda
package llm
import (
"bufio"
"bytes"
"errors"
"fmt"
"log"
"os/exec"
"path"
"strconv"
"strings"
"github.com/jmorganca/ollama/format"
)
var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
var freeMiB int64
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "[Insufficient Permissions]") {
return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
}
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
freeMiB += vram
}
freeBytes := freeMiB * 1024 * 1024
if freeBytes < 2*format.GigaByte {
log.Printf("less than 2 GB VRAM available")
return 0, errAvailableVRAM
}
return freeBytes, nil
}

@ -1,21 +0,0 @@
//go:build !rocm && !cuda
package llm
import (
"errors"
)
var (
errNoAccel = errors.New("no accelerator support in this binary")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return make([]ModelRunner, 0, 1)
}
// CheckVRAM is a stub with no accelerator.
func CheckVRAM() (int64, error) {
return 0, errNoGPU
}

@ -1,85 +0,0 @@
//go:build rocm
package llm
import (
"bytes"
"encoding/csv"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
)
var errNoAccel = errors.New("rocm-smi command failed")
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs
func CheckVRAM() (int64, error) {
rocmHome := os.Getenv("ROCM_PATH")
if rocmHome == "" {
rocmHome = os.Getenv("ROCM_HOME")
}
if rocmHome == "" {
log.Println("warning: ROCM_PATH is not set. Trying a likely fallback path, but it is recommended to set this variable in the environment.")
rocmHome = "/opt/rocm"
}
cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
csvData := csv.NewReader(&stdout)
// llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME.
totalBiggestCard := int64(0)
bigCardName := ""
for {
record, err := csvData.Read()
if err == io.EOF {
break
}
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
if !strings.HasPrefix(record[0], "card") {
continue
}
cardTotal, err := strconv.ParseInt(record[1], 10, 64)
if err != nil {
return 0, err
}
cardUsed, err := strconv.ParseInt(record[2], 10, 64)
if err != nil {
return 0, err
}
possible := (cardTotal - cardUsed)
log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0])
if possible > totalBiggestCard {
totalBiggestCard = possible
bigCardName = record[0]
}
}
if totalBiggestCard == 0 {
log.Printf("found ROCm GPU but failed to parse free VRAM!")
return 0, errNoAccel
}
log.Printf("ROCm selecting device %q", bigCardName)
return totalBiggestCard, nil
}

@@ -1,7 +1,7 @@
 package llm

 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
@@ -25,6 +25,8 @@ package llm
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
+// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
@@ -35,7 +37,7 @@ package llm
 #cgo windows LDFLAGS: -lext_server_shared -lpthread

 #include <stdlib.h>
-#include "examples/server/server.h"
+#include "server.h"
 */
 import "C"
@@ -43,25 +45,51 @@
 	"bytes"
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
 	"os"
 	"runtime"
+	"strings"
 	"sync"
 	"time"
 	"unsafe"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
 )
-func errWrap(resp C.ext_server_err) error {
-	if resp.code == 0 {
-		return nil
-	}
-	err := fmt.Errorf(C.GoString(resp.err))
-	C.free(unsafe.Pointer(resp.err))
-	return err
-}
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf(C.GoString(resp.msg))
+}
+
+type extServer interface {
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char)
+}

 type llamaExtServer struct {
@ -71,21 +99,61 @@ type llamaExtServer struct {
// Note: current implementation does not support concurrent instantiations // Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex var mutex sync.Mutex
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) { func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
server := &llamaExtServer{opts}
return newExtServer(server, model, adapters, projectors, numLayers, opts)
}
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
if !mutex.TryLock() { if !mutex.TryLock() {
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock() mutex.Lock()
} }
server := &llamaExtServer{opts}
fileInfo, err := os.Stat(model) fileInfo, err := os.Stat(model)
if err != nil { if err != nil {
return nil, err return nil, err
} }
var sparams C.ext_server_params var sparams C.ext_server_params_t
sparams.model = C.CString(model) sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model)) defer C.free(unsafe.Pointer(sparams.model))
numGPU := NumGPU(numLayers, fileInfo.Size(), opts) numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
sparams.embedding = true sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx) sparams.n_ctx = C.uint(opts.NumCtx)
@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
// Always use the value encoded in the model // Always use the value encoded in the model
sparams.rope_freq_base = 0.0 sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0 sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
sparams.numa = C.bool(opts.UseNUMA)
sparams.lora_adapters = nil sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ { for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter)) la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la)) defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i]) la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter)) defer C.free(unsafe.Pointer(la.adapter))
@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
} }
} }
// TODO - implement ME if len(projectors) > 0 {
// if len(projectors) > 0 { // TODO: applying multiple projectors is not supported by the llama.cpp server yet
// // TODO: applying multiple projectors is not supported by the llama.cpp server yet sparams.mmproj = C.CString(projectors[0])
// params = append(params, "--mmproj", projectors[0]) defer C.free(unsafe.Pointer(sparams.mmproj))
// } } else {
sparams.mmproj = nil
}
if opts.NumThread > 0 { if opts.NumThread > 0 {
sparams.n_threads = C.uint(opts.NumThread) sparams.n_threads = C.uint(opts.NumThread)
@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
sparams.n_threads = C.uint(runtime.NumCPU()) sparams.n_threads = C.uint(runtime.NumCPU())
} }
sparams.memory_f16 = false
if opts.F16KV {
sparams.memory_f16 = true
}
sparams.use_mlock = false
if opts.UseMLock {
sparams.use_mlock = true
}
sparams.use_mmap = true
if !opts.UseMMap {
sparams.use_mmap = false
}
sparams.numa = false
if opts.UseNUMA {
sparams.numa = true
}
log.Printf("Initializing internal llama server") log.Printf("Initializing internal llama server")
err = errWrap(C.llama_server_init(&sparams)) resp := newExtServerResp(128)
if err != nil { defer freeExtServerResp(resp)
return nil, err server.llama_server_init(&sparams, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
} }
log.Printf("Starting internal llama main loop") log.Printf("Starting internal llama main loop")
C.llama_server_start() server.llama_server_start()
return server, nil return server, nil
} }
func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.Options, ctx, pred, fn)
}
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
log.Printf("loaded %d images", len(imageData))
request := map[string]any{ request := map[string]any{
"prompt": predict.Prompt, "prompt": predict.Prompt,
"stream": true, "stream": true,
"n_predict": llm.NumPredict, "n_predict": opts.NumPredict,
"n_keep": llm.NumKeep, "n_keep": opts.NumKeep,
"temperature": llm.Temperature, "temperature": opts.Temperature,
"top_k": llm.TopK, "top_k": opts.TopK,
"top_p": llm.TopP, "top_p": opts.TopP,
"tfs_z": llm.TFSZ, "tfs_z": opts.TFSZ,
"typical_p": llm.TypicalP, "typical_p": opts.TypicalP,
"repeat_last_n": llm.RepeatLastN, "repeat_last_n": opts.RepeatLastN,
"repeat_penalty": llm.RepeatPenalty, "repeat_penalty": opts.RepeatPenalty,
"presence_penalty": llm.PresencePenalty, "presence_penalty": opts.PresencePenalty,
"frequency_penalty": llm.FrequencyPenalty, "frequency_penalty": opts.FrequencyPenalty,
"mirostat": llm.Mirostat, "mirostat": opts.Mirostat,
"mirostat_tau": llm.MirostatTau, "mirostat_tau": opts.MirostatTau,
"mirostat_eta": llm.MirostatEta, "mirostat_eta": opts.MirostatEta,
"penalize_nl": llm.PenalizeNewline, "penalize_nl": opts.PenalizeNewline,
"seed": llm.Seed, "seed": opts.Seed,
"stop": llm.Stop, "stop": opts.Stop,
"image_data": imageData,
} }
if predict.Format == "json" { if predict.Format == "json" {
request["grammar"] = jsonGrammar request["grammar"] = jsonGrammar
} }
// Handling JSON marshaling with special characters unescaped. retryDelay := 100 * time.Microsecond
buffer := &bytes.Buffer{} for retries := 0; retries < maxRetries; retries++ {
enc := json.NewEncoder(buffer) if retries > 0 {
enc.SetEscapeHTML(false) time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
if err := enc.Encode(request); err != nil { // Handling JSON marshaling with special characters unescaped.
return fmt.Errorf("failed to marshal data: %w", err) buffer := &bytes.Buffer{}
} enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
req := C.CString(buffer.String()) if err := enc.Encode(request); err != nil {
defer C.free(unsafe.Pointer(req)) return fmt.Errorf("failed to marshal data: %w", err)
}
cmpCtx := C.llama_server_completion(req) req := C.CString(buffer.String())
if cmpCtx.task_id < 0 { defer C.free(unsafe.Pointer(req))
defer C.free(unsafe.Pointer(cmpCtx.err))
return fmt.Errorf(C.GoString(cmpCtx.err))
}
for { llm.llama_server_completion(req, &resp)
select { if resp.id < 0 {
case <-ctx.Done(): return extServerResponseToErr(resp)
// This handles the request cancellation }
return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
default:
result := C.llama_server_completion_next_result(cmpCtx.task_id)
if result.result_json != nil {
defer C.free(unsafe.Pointer(result.result_json))
}
var p prediction
if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
}
if p.Content != "" { retryNeeded := false
fn(PredictResult{ out:
// Model: predict.Model, // XXX remove or replace? for {
CreatedAt: time.Now().UTC(), select {
Content: p.Content, case <-ctx.Done():
}) // This handles the request cancellation
} llm.llama_server_completion_cancel(resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
default:
var result C.ext_server_task_result_t
llm.llama_server_completion_next_result(resp.id, &result)
json_resp := C.GoString(result.json_resp)
llm.llama_server_release_task_result(&result)
if p.Stop { var p prediction
fn(PredictResult{ if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
// Model: predict.Model, // XXX remove or replace? llm.llama_server_completion_cancel(resp.id, &resp)
CreatedAt: time.Now().UTC(), if resp.id < 0 {
TotalDuration: time.Since(predict.CheckpointStart), return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
Done: true, } else {
PromptEvalCount: p.Timings.PromptN, return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), }
EvalCount: p.Timings.PredictedN, }
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
}) if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
return nil retryNeeded = true
// task will already be canceled
break out
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
} }
} }
if !retryNeeded {
return nil // success
}
} }
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
} }
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt}) data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil { if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err) return nil, fmt.Errorf("marshaling encode data: %w", err)
} }
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_tokenize(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_tokenize(req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
} }
defer llm.llama_server_release_json_resp(&json_resp)
var encoded TokenizeResponse var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2) return nil, fmt.Errorf("unmarshal encode response: %w", err2)
} }
@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er
} }
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 { if len(tokens) == 0 {
return "", nil return "", nil
} }
@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_detokenize(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_detokenize(req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
} }
defer llm.llama_server_release_json_resp(&json_resp)
var decoded DetokenizeResponse var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2) return "", fmt.Errorf("unmarshal encode response: %w", err2)
} }
@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
} }
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input}) data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil { if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err) return nil, fmt.Errorf("error marshaling embed data: %w", err)
@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_embedding(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_embedding(req, &json_resp, &resp)
} if resp.id < 0 {
if err != nil { return nil, extServerResponseToErr(resp)
return nil, err
} }
defer llm.llama_server_release_json_resp(&json_resp)
var embedding EmbeddingResponse var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil { if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err) return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
} }
return embedding.Embedding, nil return embedding.Embedding, nil
} }
func (llm *llamaExtServer) Ping(ctx context.Context) error { func (llm *llamaExtServer) Close() {
// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state close(llm)
return nil
} }
func (llm *llamaExtServer) Close() { func close(llm extServer) {
C.llama_server_stop() llm.llama_server_stop()
mutex.Unlock() mutex.Unlock()
} }
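
The point of routing everything through the `extServer` interface and the shared `newExtServer`/`predict`/`encode`/`decode`/`embedding`/`close` helpers is that another backend (for example a ROCm build loaded from a separate shared library, or a test double) only has to supply the thin `llama_server_*` wrappers. Because the real signatures are cgo types, a standalone example is awkward, so the sketch below illustrates the same pattern in plain Go; the `backend`, `generate`, and `mockBackend` names are illustrative only and not part of the commit:

```
package main

import "fmt"

// backend captures the thin primitives each server implementation supplies,
// in the same spirit as the llama_server_* methods on the extServer interface.
type backend interface {
	start()
	completion(req string) (string, error)
	stop()
}

// generate is the shared helper: like predict/encode/decode/embedding above,
// it depends only on the interface, so any backend can reuse the same logic.
func generate(b backend, prompt string) (string, error) {
	b.start()
	defer b.stop()
	return b.completion(prompt)
}

// mockBackend stands in for llamaExtServer or a hypothetical ROCm-backed server.
type mockBackend struct{}

func (mockBackend) start() {}
func (mockBackend) stop()  {}
func (mockBackend) completion(req string) (string, error) {
	return "echo: " + req, nil
}

func main() {
	out, err := generate(mockBackend{}, "hello")
	fmt.Println(out, err)
}
```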

@ -1,57 +0,0 @@
//go:build linux || windows
package llm
import (
"errors"
"log"
"github.com/jmorganca/ollama/api"
)
/*
#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/"
#cgo linux LDFLAGS: -lnvidia-ml
#include <stdlib.h>
#include "examples/server/server.h"
*/
import "C"
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
return int64(C.check_vram()), nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
freeBytes, err := CheckVRAM()
if err != nil {
if !errors.Is(err, errNvidiaSMI) {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
return 0
}
/*
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram,
to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := fileSizeBytes / numLayer
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(freeBytes/bytesPerLayer) * 3 / 4
// TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
// if int64(layers) < numLayer {
// log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
// return 0
// }
log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer)
return layers
}

@@ -1,10 +1,11 @@
 # common logic across linux and darwin

 init_vars() {
+    LLAMACPP_DIR=gguf
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server"
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
     if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
@@ -29,6 +30,6 @@ apply_patches() {
 }

 build() {
-    cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
 }

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp
@@ -30,6 +30,7 @@ git_module_setup
 apply_patches
 build

+# TODO - improve this to handle test cases that need it to be in "." around the tree
 # Enable local debug/run usecase
 if [ -e "gguf/ggml-metal.metal" ]; then
     cp gguf/ggml-metal.metal ../../

@@ -1,17 +1,73 @@
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp

 set -ex
 set -o pipefail

-# TODO - stopped here - map the variables from above over and refine the case statement below
 echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/cuda"
 git_module_setup
 apply_patches
+CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cuda"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ../../dist/
 build
# TODO - explore mechanism to soften the hard cuda dependency on linux
# by conditionally building some archive here that aggregates the cuda libs if present
# so that the cgo flags link this intermediate archive instead of the underlying cuda libs
#
# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \
# -Wl,--whole-archive \
# ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \
# ${BUILD_DIR}/common/libcommon.a \
# ${BUILD_DIR}/libllama.a \
# ${BUILD_DIR}/examples/llava/libllava_static.a \
# -Wl,--no-whole-archive \
# -lrt -lpthread -ldl -lstdc++ -lm \
# /usr/local/cuda/lib64/libcudart_static.a \
# /usr/local/cuda/lib64/libcublas_static.a \
# /usr/local/cuda/lib64/libcublasLt_static.a \
# /usr/local/cuda/lib64/libcudadevrt.a \
# /usr/local/cuda/lib64/libculibos.a
if [ -z "${ROCM_PATH}" ] ; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ] ; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
BUILD_DIR="gguf/build/rocm"
LIB_DIR="${BUILD_DIR}/lib"
mkdir -p ${LIB_DIR}
# Ensure we have at least one file present for the embed
touch ${LIB_DIR}/.generated
if [ -d "${ROCM_PATH}" ] ; then
echo "Building ROCm"
init_vars
CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
build
gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/examples/server/libext_server.a \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,--no-whole-archive \
-lrt -lpthread -ldl -lstdc++ -lm \
-L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
-lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
fi

@@ -48,4 +48,8 @@ init_vars
 git_module_setup
 apply_patches
 build
 install
+
+# TODO - implement ROCm support on windows
+md gguf/build/winrocm/lib -ea 0
+echo $null >> gguf/build/winrocm/lib/.generated

@@ -1,3 +1,3 @@
 package llm

-//go:generate sh ./gen_linux.sh
+//go:generate bash ./gen_linux.sh

@ -1,24 +0,0 @@
//go:build cuda
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate rm -rf ggml/build/cuda
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf gguf/build/cuda
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner

@ -1,25 +0,0 @@
//go:build rocm
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf ggml/build/rocm
//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/rocm --target server --config Release
//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner
//go:generate rm -rf gguf/build/rocm
//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'
//go:generate cmake --build gguf/build/rocm --target server --config Release
//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner

@ -1,15 +1,15 @@
From 64b3fbb150d12b3ca63ac2fb4e57bc46f41d2ccd Mon Sep 17 00:00:00 2001 From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com> From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 13 Nov 2023 12:25:58 -0800 Date: Mon, 13 Nov 2023 12:25:58 -0800
Subject: [PATCH] Expose callable API for server Subject: [PATCH] Expose callable API for server
This adds an extern "C" interface within the example server This adds an extern "C" interface within the example server
--- ---
examples/server/CMakeLists.txt | 24 ++++ examples/server/CMakeLists.txt | 24 +++
examples/server/server.cpp | 247 +++++++++++++++++++++++++++++++++ examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++
examples/server/server.h | 83 +++++++++++ examples/server/server.h | 89 +++++++++++
ggml-cuda.cu | 1 + ggml-cuda.cu | 1 +
4 files changed, 355 insertions(+) 4 files changed, 388 insertions(+)
create mode 100644 examples/server/server.h create mode 100644 examples/server/server.h
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
+endif() +endif()
\ No newline at end of file \ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 895f751..f939590 100644 index d0cd8e1..5f5d4c5 100644
--- a/examples/server/server.cpp --- a/examples/server/server.cpp
+++ b/examples/server/server.cpp +++ b/examples/server/server.cpp
@@ -5,6 +5,9 @@ @@ -5,6 +5,9 @@
@ -59,7 +59,7 @@ index 895f751..f939590 100644
#ifndef NDEBUG #ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
@@ -2631,6 +2634,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con @@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
} }
} }
@ -67,84 +67,84 @@ index 895f751..f939590 100644
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// own arguments required by this example // own arguments required by this example
@@ -3065,3 +3069,246 @@ int main(int argc, char **argv) @@ -3066,3 +3070,273 @@ int main(int argc, char **argv)
llama_backend_free(); llama_backend_free();
return 0; return 0;
} }
+ +
+#else // LLAMA_SERVER_LIBRARY +#else // LLAMA_SERVER_LIBRARY
+// Expose the llama server as a callable extern "C" API +// Expose the llama server as a callable extern "C" API
+llama_server_context llama; +llama_server_context *llama = NULL;
+std::atomic<bool> ext_server_running(false); +std::atomic<bool> ext_server_running(false);
+std::thread ext_server_thread; +std::thread ext_server_thread;
+inline ext_server_err makeErr(uint32_t code, std::string msg) {
+ if (code == 0) {
+ return ext_server_err{0, NULL};
+ }
+ const std::string::size_type size = msg.size();
+ ext_server_err ret = {
+ code,
+ new char[size + 1],
+ };
+ memcpy(ret.err, msg.c_str(), size + 1);
+ return ret;
+}
+ +
+ext_server_err llama_server_init(ext_server_params *sparams) +void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err)
+{ +{
+ log_set_target(stdout); + assert(err != NULL && sparams != NULL);
+ gpt_params params; + err->id = 0;
+ params.n_ctx = sparams->n_ctx; + err->msg[0] = '\0';
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
+
+ if (sparams->memory_f16) {
+ params.cache_type_k = "f16";
+ params.cache_type_v = "f16";
+ } else {
+ params.cache_type_k = "f32";
+ params.cache_type_v = "f32";
+ }
+
+ params.n_gpu_layers = sparams->n_gpu_layers;
+ params.main_gpu = sparams->main_gpu;
+ params.use_mlock = sparams->use_mlock;
+ params.use_mmap = sparams->use_mmap;
+ params.numa = sparams->numa;
+ params.embedding = sparams->embedding;
+ if (sparams->model != NULL) {
+ params.model = sparams->model;
+ }
+
+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+ }
+
+ try { + try {
+ llama = new llama_server_context;
+ log_set_target(stdout);
+ gpt_params params;
+ params.n_ctx = sparams->n_ctx;
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
+
+ if (sparams->memory_f16) {
+ params.cache_type_k = "f16";
+ params.cache_type_v = "f16";
+ } else {
+ params.cache_type_k = "f32";
+ params.cache_type_v = "f32";
+ }
+
+ params.n_gpu_layers = sparams->n_gpu_layers;
+ params.main_gpu = sparams->main_gpu;
+ params.use_mlock = sparams->use_mlock;
+ params.use_mmap = sparams->use_mmap;
+ params.numa = sparams->numa;
+ params.embedding = sparams->embedding;
+ if (sparams->model != NULL) {
+ params.model = sparams->model;
+ }
+
+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+ }
+
+ if (sparams->mmproj != NULL) {
+ params.mmproj = std::string(sparams->mmproj);
+ }
+
+ llama_backend_init(params.numa); + llama_backend_init(params.numa);
+ +
+ // load the model + // load the model
+ if (!llama.load_model(params)) + if (!llama->load_model(params))
+ { + {
+ // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages + // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages
+ // and pass them back to the caller for better UX + // and pass them back to the caller for better UX
+ return makeErr(1, "error loading model " + params.model); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+ return;
+ } + }
+ +
+ llama.initialize(); + llama->initialize();
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception initializing llama server"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+void llama_server_start() +void llama_server_start()
+{ +{
+ assert(llama != NULL);
+ // TODO mutex to protect thread creation + // TODO mutex to protect thread creation
+ ext_server_thread = std::thread([&]() + ext_server_thread = std::thread([&]()
+ { + {
@ -154,7 +154,7 @@ index 895f751..f939590 100644
+ ggml_time_init(); + ggml_time_init();
+ while (ext_server_running.load()) + while (ext_server_running.load())
+ { + {
+ if (!llama.update_slots()) { + if (!llama->update_slots()) {
+ LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n"); + LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n");
+ break; + break;
+ } + }
@ -170,124 +170,150 @@ index 895f751..f939590 100644
+} +}
+ +
+void llama_server_stop() { +void llama_server_stop() {
+ assert(llama != NULL);
+ // TODO - too verbose, remove once things are solid + // TODO - too verbose, remove once things are solid
+ LOG_TEE("requesting llama server shutdown\n"); + LOG_TEE("requesting llama server shutdown\n");
+ ext_server_running = false; + ext_server_running = false;
+ ext_server_thread.join(); + ext_server_thread.join();
+ delete llama;
+ llama = NULL;
+ LOG_TEE("llama server shutdown complete\n"); + LOG_TEE("llama server shutdown complete\n");
+} +}
+ +
+ext_server_completion_resp llama_server_completion(const char *json_req) { +void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+ std::string msg; + assert(llama != NULL && json_req != NULL && resp != NULL);
+ ext_server_completion_resp resp = { + resp->id = -1;
+ 0, + resp->msg[0] = '\0';
+ NULL,
+ };
+ try { + try {
+ json data = json::parse(json_req); + json data = json::parse(json_req);
+ resp.task_id = llama.request_completion(data, false, false, -1); + resp->id = llama->request_completion(data, false, false, -1);
+ return resp;
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ msg = e.what(); + snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ msg = "Unknown Exception during completion"; + snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+ } + }
+ const std::string::size_type size = msg.size();
+ resp.task_id = 0;
+ resp.err = new char[size + 1];
+ memcpy(resp.err, msg.c_str(), size + 1);
+ return resp;
+} +}
+ +
+ext_task_result llama_server_completion_next_result(const int task_id) { +void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) {
+ assert(llama != NULL && resp != NULL);
+ std::string msg; + std::string msg;
+ ext_task_result resp = {-1,false,false,NULL}; + resp->id = -1;
+ try { + resp->stop = false;
+ task_result result = llama.next_result(task_id); + resp->error = false;
+ std::string result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+ const std::string::size_type size = result_json.size();
+ resp.id = result.id;
+ resp.stop = result.stop;
+ resp.error = result.error;
+ resp.result_json = new char[size + 1];
+ memcpy(resp.result_json, result_json.c_str(), size + 1);
+ if (result.error) {
+ llama.request_cancel(task_id);
+ } else if (result.stop) {
+ llama.request_cancel(task_id);
+ }
+ return resp;
+ } catch (std::exception &e) {
+ msg = e.what(); // TODO - json?
+ } catch (...) {
+ msg = "Unknown Exception during completion";
+ }
+ resp.error = true;
+ const std::string::size_type size = msg.size();
+ resp.result_json = new char[size + 1];
+ memcpy(resp.result_json, msg.c_str(), size + 1);
+ return resp;
+}
+
+ext_server_err llama_server_completion_cancel(const int task_id) {
+ try {
+ llama.request_cancel(task_id);
+ } catch (std::exception &e) {
+ return makeErr(1, e.what());
+ } catch (...) {
+ return makeErr(1, "Unknown Exception running llama server");
+ }
+ return makeErr(0, "");
+}
+
+
+ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp) {
+ resp->json_resp = NULL; + resp->json_resp = NULL;
+ std::string result_json;
+ try {
+ task_result result = llama->next_result(task_id);
+ result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+ resp->id = result.id;
+ resp->stop = result.stop;
+ resp->error = result.error;
+ if (result.error) {
+ llama->request_cancel(task_id);
+ } else if (result.stop) {
+ llama->request_cancel(task_id);
+ }
+ } catch (std::exception &e) {
+ resp->error = true;
+ resp->id = -1;
+ result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
+ } catch (...) {
+ resp->error = true;
+ resp->id = -1;
+ result_json = "{\"error\":\"Unknown exception during completion\"}";
+ }
+ const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size];
+ snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+ if (result == NULL || result->json_resp == NULL) {
+ return;
+ }
+ delete[] result->json_resp;
+}
+
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+ assert(llama != NULL && err != NULL);
+ err->id = 0;
+ err->msg[0] = '\0';
+ try {
+ llama->request_cancel(task_id);
+ } catch (std::exception &e) {
+ err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) {
+ err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server");
+ }
+}
+
+void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ std::vector<llama_token> tokens; + std::vector<llama_token> tokens;
+ if (body.count("content") != 0) + if (body.count("content") != 0)
+ { + {
+ tokens = llama.tokenize(body["content"], false); + tokens = llama->tokenize(body["content"], false);
+ } + }
+ const json data = format_tokenizer_response(tokens); + const json data = format_tokenizer_response(tokens);
+ std::string result_json = data.dump(); + std::string result_json = data.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during tokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp) { +void llama_server_release_json_resp(char **json_resp) {
+ resp->json_resp = NULL; + if (json_resp == NULL || *json_resp == NULL) {
+ return;
+ }
+ delete[] *json_resp;
+}
+
+void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ std::string content; + std::string content;
+ if (body.count("tokens") != 0) + if (body.count("tokens") != 0)
+ { + {
+ const std::vector<llama_token> tokens = body["tokens"]; + const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
+ } + }
+ const json data = format_detokenized_response(content); + const json data = format_detokenized_response(content);
+ std::string result_json = data.dump(); + std::string result_json = data.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during detokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp) { +void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) {
+ resp->json_resp = NULL; + assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ json prompt; + json prompt;
@ -299,28 +325,29 @@ index 895f751..f939590 100644
+ { + {
+ prompt = ""; + prompt = "";
+ } + }
+ const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); + const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+ task_result result = llama.next_result(task_id); + task_result result = llama->next_result(task_id);
+ std::string result_json = result.result_json.dump(); + std::string result_json = result.result_json.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during detokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+#endif // LLAMA_SERVER_LIBRARY +#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file \ No newline at end of file
diff --git a/examples/server/server.h b/examples/server/server.h diff --git a/examples/server/server.h b/examples/server/server.h
new file mode 100644 new file mode 100644
index 0000000..4d03b1e index 0000000..d22f1b6
--- /dev/null --- /dev/null
+++ b/examples/server/server.h +++ b/examples/server/server.h
@@ -0,0 +1,83 @@ @@ -0,0 +1,89 @@
+#if defined(LLAMA_SERVER_LIBRARY) +#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H +#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H +#define LLAMA_SERVER_H
@ -336,20 +363,23 @@ index 0000000..4d03b1e
+extern "C" +extern "C"
+{ +{
+#endif +#endif
+ // TODO - clean the type def's up a bit for better consistency + typedef struct ext_server_resp {
+ typedef struct ext_server_err { + int id; // < 0 on error
+ uint32_t code; // 0 on success, > 0 on error + size_t msg_len; // caller must allocate msg and set msg_len
+ char *err; // null if code == 0; else contains error message. Caller responsible for freeing memory + char *msg;
+ } ext_server_err; + } ext_server_resp_t;
+ +
+ // Allocated and freed by caller
+ typedef struct ext_server_lora_adapter { + typedef struct ext_server_lora_adapter {
+ char *adapter; + char *adapter;
+ float scale; + float scale;
+ struct ext_server_lora_adapter *next; + struct ext_server_lora_adapter *next;
+ } ext_server_lora_adapter; + } ext_server_lora_adapter_t;
+
+ // Allocated and freed by caller
+ typedef struct ext_server_params + typedef struct ext_server_params
+ { + {
+ char *model; + char *model;
+ uint32_t n_ctx; // text context, 0 = from model + uint32_t n_ctx; // text context, 0 = from model
+ uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_batch; // prompt processing maximum batch size
+ uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads; // number of threads to use for generation
@ -363,40 +393,43 @@ index 0000000..4d03b1e
+ bool use_mmap; // use mmap if possible + bool use_mmap; // use mmap if possible
+ bool numa; // attempt optimizations that help on some NUMA systems + bool numa; // attempt optimizations that help on some NUMA systems
+ bool embedding; // get only sentence embedding + bool embedding; // get only sentence embedding
+ ext_server_lora_adapter* lora_adapters; + ext_server_lora_adapter_t* lora_adapters;
+ } ext_server_params; + char *mmproj;
+ } ext_server_params_t;
+ +
+ // Initialize the server once per process + typedef struct ext_server_task_result
+ ext_server_err llama_server_init(ext_server_params *sparams);
+
+ // Run the main loop
+ void llama_server_start();
+ // Stop the main loop
+ void llama_server_stop();
+
+ typedef struct ext_task_result
+ { + {
+ int id; + int id;
+ bool stop; + bool stop;
+ bool error; + bool error;
+ char* result_json; // caller responsible to free this memory + char* json_resp; // null terminated, memory managed by ext_server
+ } ext_task_result; + } ext_server_task_result_t;
+
+ typedef struct ext_server_completion_resp {
+ int task_id; // < 0 on error, >= 0 on success
+ char *err; // null if task_id >= 0; else contains error message. Caller responsible for freeing memory
+ } ext_server_completion_resp;
+ ext_server_completion_resp llama_server_completion(const char *json_req);
+ ext_task_result llama_server_completion_next_result(const int task_id);
+ ext_server_err llama_server_completion_cancel(const int task_id);
+ +
+ // Caller responsible for freeing json_resp + // Initialize the server once per process
+ typedef struct ext_server_resp { + // err->id = 0 for success and err->msg[0] = NULL
+ char *json_resp; // Caller responsible for freeing string + // err->id != 0 for failure, and err->msg contains error message
+ } ext_server_resp; + void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+ ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp); +
+ ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp); + // Run the main loop, called once per init
+ ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp); + void llama_server_start();
+ // Stop the main loop and free up resources allocated in init and start. Init must be called again to reuse
+ void llama_server_stop();
+
+ // json_req null terminated string, memory managed by caller
+ // resp->id >= 0 on success (task ID)
+ // resp->id < 0 on error, and resp->msg contains error message
+ void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+ // Caller must call llama_server_release_task_result to free resp->json_resp
+ void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result);
+ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+ void llama_server_release_task_result(ext_server_task_result_t *result);
+
+ // Caller must call llama_server_release_json_resp to free json_resp when the call succeeds (err.id == 0)
+ void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err);
+ void llama_server_release_json_resp(char **json_resp);
+ +
+#ifdef __cplusplus +#ifdef __cplusplus
+} +}
@ -406,10 +439,10 @@ index 0000000..4d03b1e
+#endif // LLAMA_SERVER_LIBRARY +#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file \ No newline at end of file
diff --git a/ggml-cuda.cu b/ggml-cuda.cu diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 85f7a29..ce51364 100644 index 9e1acd3..ea64b55 100644
--- a/ggml-cuda.cu --- a/ggml-cuda.cu
+++ b/ggml-cuda.cu +++ b/ggml-cuda.cu
@@ -6410,6 +6410,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( @@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
src_ptr = (char *) extra->data_device[id]; src_ptr = (char *) extra->data_device[id];
} else { } else {


@ -3,6 +3,7 @@ package llm
import ( import (
"bytes" "bytes"
"context" "context"
_ "embed"
"errors" "errors"
"fmt" "fmt"
"os" "os"
@ -112,12 +113,6 @@ type ImageData struct {
ID int `json:"id"` ID int `json:"id"`
} }
type llama struct {
api.Options
ImageData []ImageData
Running
}
var ( var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
@ -166,7 +161,8 @@ type prediction struct {
} }
const maxBufferSize = 512 * format.KiloByte const maxBufferSize = 512 * format.KiloByte
const maxRetries = 6 const maxRetries = 3
const retryDelay = 1 * time.Second
type PredictOpts struct { type PredictOpts struct {
Prompt string Prompt string
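For orientation only, here is a generic Go sketch of how constants like maxRetries and retryDelay are typically consumed. This is not the retry code in llama.go (that code sits outside the hunk above); doRequest is a hypothetical stand-in for the call being retried.

package main

import (
	"errors"
	"fmt"
	"time"
)

const (
	maxRetries = 3
	retryDelay = 1 * time.Second
)

// doRequest is a hypothetical stand-in for the flaky call being retried.
func doRequest() error {
	return errors.New("connection refused")
}

func main() {
	var err error
	for try := 0; try < maxRetries; try++ {
		if err = doRequest(); err == nil {
			break
		}
		fmt.Printf("attempt %d failed: %v; retrying in %s\n", try+1, err, retryDelay)
		time.Sleep(retryDelay)
	}
	if err != nil {
		fmt.Println("giving up:", err)
	}
}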


@ -11,6 +11,7 @@ import (
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format" "github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/gpu"
) )
type LLM interface { type LLM interface {
@ -19,7 +20,6 @@ type LLM interface {
Encode(context.Context, string) ([]int, error) Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error) Decode(context.Context, []int) (string, error)
Close() Close()
Ping(context.Context) error
} }
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
@ -78,5 +78,17 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
opts.NumGQA = 0 opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0 opts.RopeFrequencyScale = 0.0
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) gpuInfo := gpu.GetGPUInfo()
switch gpuInfo.Driver {
case "ROCM":
return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
default:
// Rely on the built-in CUDA based server which will fall back to CPU
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
}
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
} }

llm/rocm_shim.c Normal file

@ -0,0 +1,134 @@
#include "rocm_shim.h"
#include <stdio.h>
#include <string.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("Lazy loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY);
if (!s->handle) {
err->id = -1;
snprintf(
err->msg, err->msg_len,
"Unable to load rocm server library: %s (If you have a Radeon card, "
"did you install the ROCM libraries?)",
LOAD_ERR());
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, LOAD_ERR());
return;
}
}
}
inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
s.llama_server_start();
}
inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
s.llama_server_stop();
}
inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

llm/rocm_shim.h Normal file

@ -0,0 +1,73 @@
#include <stdlib.h>
#include "server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct rocm_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void rocm_shim_llama_server_start(struct rocm_llama_server s);
void rocm_shim_llama_server_stop(struct rocm_llama_server s);
void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result);
void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result);
void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
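To illustrate the indirection noted above (Go cannot invoke the C function pointers stored in rocm_llama_server directly, so calls go through the plain C wrappers), here is a minimal cgo sketch that is not part of the change. It assumes rocm_shim.h and server.h are on the include path, rocm_shim.c is compiled into the same package, and the library path is hypothetical.

package main

/*
#include <stdlib.h>
#include "rocm_shim.h"
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// Hypothetical path to a dynamically loadable ROCm build of the server.
	libPath := C.CString("./librocm_server.so")
	defer C.free(unsafe.Pointer(libPath))

	// The caller owns the error buffer: allocate msg and set msg_len.
	var errResp C.ext_server_resp_t
	errResp.msg_len = 512
	errResp.msg = (*C.char)(C.malloc(512))
	defer C.free(unsafe.Pointer(errResp.msg))

	// Resolve all llama_server_* symbols out of the shared library.
	var srv C.struct_rocm_llama_server
	C.rocm_shim_init(libPath, &srv, &errResp)
	if errResp.id < 0 {
		fmt.Println("failed to load ROCm server library:", C.GoString(errResp.msg))
		return
	}

	// srv.llama_server_start and friends are C function pointers, which Go
	// cannot call directly; every call goes through the matching
	// rocm_shim_llama_server_* wrapper instead, e.g.
	//   C.rocm_shim_llama_server_init(srv, &params, &errResp)
	//   C.rocm_shim_llama_server_start(srv)
	fmt.Println("shim loaded")
}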

llm/shim_darwin.go Normal file

@ -0,0 +1,18 @@
package llm
import (
"fmt"
"github.com/jmorganca/ollama/api"
)
// no-op stubs for mac
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// should never happen...
return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
}
func nativeInit(workDir string) error {
return nil
}

llm/shim_ext_server.go Normal file

@ -0,0 +1,212 @@
//go:build !darwin
package llm
/*
#include <stdlib.h>
#include "rocm_shim.h"
*/
import "C"
import (
"context"
"embed"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"runtime"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
//go:embed llama.cpp/gguf/build/*/lib/*
var libEmbed embed.FS
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
var NoShim = true
type shimExtServer struct {
s C.struct_rocm_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var shimMutex sync.Mutex
var llm *shimExtServer
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_init(llm.s, sparams, err)
}
func (llm *shimExtServer) llama_server_start() {
C.rocm_shim_llama_server_start(llm.s)
}
func (llm *shimExtServer) llama_server_stop() {
C.rocm_shim_llama_server_stop(llm.s)
}
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
}
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_release_task_result(llm.s, result)
}
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
}
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
if NoShim {
return nil, RocmShimMissing
}
log.Printf("Loading ROCM llm server")
if llm == nil {
return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
}
llm.options = opts
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
}
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.options, ctx, pred, fn)
}
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *shimExtServer) Close() {
close(llm)
}
func nativeInit(workdir string) error {
err := extractLib(workdir)
if err != nil {
if err == RocmShimMissing {
log.Printf("%s", err)
return nil
}
return err
}
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
return err
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
shimMutex.Lock()
defer shimMutex.Unlock()
if llm != nil {
return nil
}
var libName string
switch runtime.GOOS {
case "darwin":
// shouldn't happen
return nil
case "linux":
libName = "librocm_server.so"
case "windows":
libName = "rocm_server.dll"
default:
// shouldn't happen
return nil
}
libPath := C.CString(filepath.Join(workdir, libName))
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_rocm_llama_server
C.rocm_shim_init(libPath, &srv, &resp)
if resp.id < 0 {
// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
// and run against CPU
return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
}
llm = &shimExtServer{
s: srv,
options: api.DefaultOptions(),
}
return nil
}
func extractLib(workDir string) error {
files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*")
if err != nil || len(files) == 0 {
// this is expected, ollama may be compiled without shim library packed in
return RocmShimMissing
}
if len(files) != 1 {
// Shouldn't happen, but just use the first one we find
log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0])
}
srcFile, err := libEmbed.Open(files[0])
if err != nil {
return fmt.Errorf("read ROCm shim %s: %v", files[0], err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(files[0]))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write ROCm shim %s: %v", files[0], err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy ROCm shim %s: %v", files[0], err)
}
case err != nil:
return fmt.Errorf("stat ROCm shim %s: %v", files[0], err)
}
NoShim = false
return nil
}


@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
mkdir -p dist mkdir -p dist
for TARGETARCH in amd64 arm64; do for TARGETARCH in amd64 arm64; do
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH docker rm builder-$TARGETARCH

scripts/build_remote.py Executable file

@ -0,0 +1,68 @@
#!/usr/bin/env python3
import subprocess
import sys
from urllib.parse import urlparse
from git import Repo
# Helper script for building on remote repos by pushing local changes with git
# (e.g. particularly helpful to target a remote windows build system)
#
# Typical windows remote git config looks like this:
#
#[remote "windows-pa"]
# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
# fetch = +refs/heads/*:refs/remotes/windows-pa/*
# uploadpack = powershell git upload-pack
# receivepack = powershell git receive-pack
#
# TODO - add argparse and make this more configurable
# - force flag becomes optional
# - generate, build or test ...
# Note: remote repo will need this run once:
# git config --local receive.denyCurrentBranch updateInstead
repo = Repo(".")
# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
# GoCmd = "/usr/local/go/bin/go"
GoCmd = "go"
if repo.is_dirty():
print("Tree is dirty. Commit your changes before running this script")
sys.exit(1)
if len(sys.argv) != 2:
print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
sys.exit(1)
remote_name = sys.argv[1]
remote = {r.name: r for r in repo.remotes}[remote_name]
raw_url = list(remote.urls)[0]
url = urlparse(raw_url)
# Windows urls don't quite parse properly
if url.scheme == "" and url.netloc == "":
url = urlparse("ssh://" + raw_url)
print("URL: " + str(url))
netloc = url.netloc.split(":")[0]
path = url.path
branch_name = repo.active_branch.name
print("Force pushing content to remote...")
# Use with care given the force push
remote.push(force=True).raise_if_error()
print("Ensuring correct branch checked out on remote via ssh...")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
# TODO - add some hardening to try to figure out how to set up the path properly
# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
# TODO - or consider paramiko maybe
print("Performing generate")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...'])
print("Building")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])


@ -32,4 +32,4 @@ for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_M
curl -L -C - --header "${ACCEPT_HEADER}" \ curl -L -C - --header "${ACCEPT_HEADER}" \
-o ${OLLAMA_MODELS}/blobs/${LAYER} \ -o ${OLLAMA_MODELS}/blobs/${LAYER} \
${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER}
done done


@ -2,14 +2,17 @@ package server
import ( import (
"context" "context"
"os"
"strings" "strings"
"sync" "sync"
"testing" "testing"
"time" "time"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
) )
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server // TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
@ -33,12 +36,16 @@ var (
} }
resp = [2]string{ resp = [2]string{
"once upon a time", "once upon a time",
"fourth thursday", "united states thanksgiving",
} }
) )
func TestIntegrationSimpleOrcaMini(t *testing.T) { func TestIntegrationSimpleOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -56,7 +63,13 @@ func TestIntegrationSimpleOrcaMini(t *testing.T) {
// get true concurrency working with n_parallel support in the backend // get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
t.Skip("concurrent prediction on single runner not currently supported") t.Skip("concurrent prediction on single runner not currently supported")
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -79,6 +92,10 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -87,6 +104,7 @@ func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(len(req)) wg.Add(len(req))
t.Logf("Running %d concurrently", len(req))
for i := 0; i < len(req); i++ { for i := 0; i < len(req); i++ {
go func(i int) { go func(i int) {
defer wg.Done() defer wg.Done()


@ -25,6 +25,7 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
"github.com/jmorganca/ollama/llm" "github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/version" "github.com/jmorganca/ollama/version"
@ -81,20 +82,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
return nil, err return nil, err
} }
ctx := c.Request.Context()
// check if the loaded model is still running in a subprocess, in case something unexpected happened
if loaded.runner != nil {
if err := loaded.runner.Ping(ctx); err != nil {
log.Print("loaded llm process not responding, closing now")
// the subprocess is no longer running, so close it
loaded.runner.Close()
loaded.runner = nil
loaded.Model = nil
loaded.Options = nil
}
}
needLoad := loaded.runner == nil || // is there a model loaded? needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed? loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@ -905,9 +892,12 @@ func Serve(ln net.Listener) error {
os.Exit(0) os.Exit(0)
}() }()
if runtime.GOOS == "linux" { if err := llm.Init(s.WorkDir); err != nil {
return fmt.Errorf("unable to initialize llm library %w", err)
}
if runtime.GOOS == "linux" { // TODO - windows too
// check compatibility to log warnings // check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil { if _, err := gpu.CheckVRAM(); err != nil {
log.Print(err.Error()) log.Print(err.Error())
} }
} }