Adapted rocm support to cgo based llama.cpp

commit 35934b2e05 (parent f8ef4439e9)
Author: Daniel Hiltgen
Date:   2023-11-29 11:00:37 -08:00

37 changed files with 1688 additions and 658 deletions

@@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
 COPY . .
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... \
-    && /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build .

 FROM ubuntu:22.04
 RUN apt-get update && apt-get install -y ca-certificates
@@ -27,5 +27,3 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]

@@ -1,19 +1,44 @@
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-amd64
+# ROCm only supports amd64
+ARG ROCM_VERSION=5.7
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+ENV ROCM_PATH=/opt/rocm

-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
+# Ubuntu 22.04 arm64 dependencies
+FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr

 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG CLBLAST_VER=1.6.1
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install

 # install go
 ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
@@ -26,6 +51,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... && \
-    /usr/local/go/bin/go build -tags cuda .
+ENV CGO_CFLAGS=${CGO_CFLAGS}
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .

@@ -185,8 +185,6 @@ ollama list

 ## Building

-### Generic (CPU)
-
 Install `cmake` and `go`:

 ```
@@ -202,32 +200,36 @@ Then build the binary:
 go build .
 ```

-### CUDA (NVIDIA)
+### Linux/Windows CUDA (NVIDIA)

 *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages.
+Note: at present, Ollama is optimized for GPU usage on linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU.
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.

 Then generate dependencies:

 ```
-go generate -tags cuda ./...
+go generate ./...
 ```

 Then build the binary:

 ```
-go build -tags cuda .
+go build .
 ```

-### ROCm (AMD)
+### Linux ROCm (AMD)

 *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

 Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.

 Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:

 ```
-CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./...
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
 ```

 Then build the binary:

 ```
-go build -tags rocm
+go build .
 ```

+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
+
 ### Running local builds

 Next, start the server:

gpu/gpu.go (new file, 119 lines added)

@ -0,0 +1,119 @@
//go:build linux || windows
package gpu
/*
#include "gpu_info.h"
*/
import "C"
import (
"fmt"
"log"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type handles struct {
cuda *C.cuda_handle_t
rocm *C.rocm_handle_t
}
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// Note: gpuMutex must already be held
func initGPUHandles() {
log.Printf("Detecting GPU type")
gpuHandles = &handles{nil, nil}
var resp C.cuda_init_resp_t
C.cuda_init(&resp)
if resp.err != nil {
log.Printf("CUDA not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
var resp C.rocm_init_resp_t
C.rocm_init(&resp)
if resp.err != nil {
log.Printf("ROCm not detected: %s", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
log.Printf("Radeon GPU detected")
rocm := resp.rh
gpuHandles.rocm = &rocm
}
} else {
log.Printf("Nvidia GPU detected")
cuda := resp.ch
gpuHandles.cuda = &cuda
}
}
func GetGPUInfo() GpuInfo {
// TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
gpuMutex.Lock()
defer gpuMutex.Unlock()
if gpuHandles == nil {
initGPUHandles()
}
var memInfo C.mem_info_t
var resp GpuInfo
if gpuHandles.cuda != nil {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
resp.Driver = "CUDA"
} else if gpuHandles.rocm != nil {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
resp.Driver = "ROCM"
} else {
C.cpu_check_ram(&memInfo)
resp.Driver = "CPU"
}
if memInfo.err != nil {
log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
}
resp.FreeMemory = uint64(memInfo.free)
resp.TotalMemory = uint64(memInfo.total)
return resp
}
func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
return int64(gpuInfo.FreeMemory), nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
info := GetGPUInfo()
if info.Driver == "CPU" {
return 0
}
/*
Calculate bytes per layer: this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram;
to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := uint64(fileSizeBytes / numLayer)
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
// TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
// if int64(layers) < numLayer {
// log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
// return 0
// }
log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
return layers
}
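
To make the offload heuristic in `NumGPU` concrete, here is a small standalone sketch of the same arithmetic. It is not part of the commit, and the model size, layer count, and free-VRAM figures are made-up example values:

```
package main

import "fmt"

// estimateLayers mirrors the heuristic in gpu.NumGPU above: bytes per layer is
// roughly the model file size divided by the layer count, and only ~75% of the
// layers that would fit in free VRAM are offloaded, leaving headroom so the GPU
// does not OOM once the KV cache is allocated.
func estimateLayers(numLayer, fileSizeBytes, freeVRAM int64) int {
	bytesPerLayer := uint64(fileSizeBytes / numLayer)
	return int(uint64(freeVRAM)/bytesPerLayer) * 3 / 4
}

func main() {
	// Hypothetical 7B Q4 model: ~3.8 GiB file, 32 layers, 6 GiB of free VRAM.
	layers := estimateLayers(32, 3800*1024*1024, 6*1024*1024*1024)
	fmt.Printf("would offload up to %d of 32 layers\n", layers)
}
```

With roughly 3.8 GiB of weights split over 32 layers and 6 GiB of free VRAM, the 75% rule yields 38, so all 32 layers of this hypothetical model would be offloaded.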

@@ -1,7 +1,8 @@
 //go:build darwin

-package llm
+package gpu

+import "C"
 import (
 	"github.com/jmorganca/ollama/api"
 )
@@ -9,11 +10,25 @@ import (
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	// TODO - assume metal, and return free memory?
-	return 0, errNvidiaSMI
+	return 0, nil
 }

+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+	return GpuInfo{
+		Driver:      "METAL",
+		TotalMemory: 0,
+		FreeMemory:  0,
+	}
+}
+
 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	// default to enable metal on macOS
 	return 1
 }
+
+func nativeInit() error {
+	return nil
+}

gpu/gpu_info.h (new file, 49 lines added)

@ -0,0 +1,49 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mem_info {
uint64_t total;
uint64_t free;
char *err; // If non-null, caller is responsible for freeing
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

gpu/gpu_info_cpu.c (new file, 42 lines added)

@ -0,0 +1,42 @@
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
resp->err = strdup(LOAD_ERR());
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif

gpu/gpu_info_cuda.c (new file, 110 lines added)

@ -0,0 +1,110 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(cuda_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
if (!resp->ch.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
return;
}
ret = (*h.initFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
ret = (*h.shutdownFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
#endif // __APPLE__

gpu/gpu_info_cuda.h (new file, 35 lines added)

@ -0,0 +1,35 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
} cuda_handle_t;
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
void cuda_init(cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__

gpu/gpu_info_rocm.c (new file, 111 lines added)

@ -0,0 +1,111 @@
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
#ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(rocm_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
if (!resp->rh.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
ret = (*h.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
(*h.shutdownFn)();
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
}
#endif // __APPLE__

gpu/gpu_info_rocm.h (new file, 36 lines added)

@ -0,0 +1,36 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
void rocm_init(rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

gpu/gpu_test.go (new file, 26 lines added)

@ -0,0 +1,26 @@
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver)
switch runtime.GOOS {
case "darwin":
// TODO - remove this once MacOS returns some size for CPU
return
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

gpu/types.go (new file, 10 lines added)

@ -0,0 +1,10 @@
package gpu
// Beginning of an `ollama info` command
type GpuInfo struct {
Driver string `json:"driver,omitempty"`
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}
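
Since the struct carries JSON tags toward the future `ollama info` command mentioned in the comment, here is a minimal, purely illustrative sketch of how a caller might render it. The type is duplicated only so the snippet compiles on its own, and the memory values are made up:

```
package main

import (
	"encoding/json"
	"fmt"
)

// GpuInfo is copied from gpu/types.go above so this illustration is self-contained.
type GpuInfo struct {
	Driver      string `json:"driver,omitempty"`
	TotalMemory uint64 `json:"total_memory,omitempty"`
	FreeMemory  uint64 `json:"free_memory,omitempty"`
}

func main() {
	// In the real package this value would come from gpu.GetGPUInfo().
	info := GpuInfo{Driver: "CUDA", TotalMemory: 8 << 30, FreeMemory: 6 << 30}
	out, err := json.MarshalIndent(info, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```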

@ -1,67 +0,0 @@
//go:build cuda
package llm
import (
"bufio"
"bytes"
"errors"
"fmt"
"log"
"os/exec"
"path"
"strconv"
"strings"
"github.com/jmorganca/ollama/format"
)
var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
var freeMiB int64
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "[Insufficient Permissions]") {
return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
}
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
freeMiB += vram
}
freeBytes := freeMiB * 1024 * 1024
if freeBytes < 2*format.GigaByte {
log.Printf("less than 2 GB VRAM available")
return 0, errAvailableVRAM
}
return freeBytes, nil
}

@ -1,21 +0,0 @@
//go:build !rocm && !cuda
package llm
import (
"errors"
)
var (
errNoAccel = errors.New("no accelerator support in this binary")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return make([]ModelRunner, 0, 1)
}
// CheckVRAM is a stub with no accelerator.
func CheckVRAM() (int64, error) {
return 0, errNoGPU
}

@ -1,85 +0,0 @@
//go:build rocm
package llm
import (
"bytes"
"encoding/csv"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
)
var errNoAccel = errors.New("rocm-smi command failed")
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs
func CheckVRAM() (int64, error) {
rocmHome := os.Getenv("ROCM_PATH")
if rocmHome == "" {
rocmHome = os.Getenv("ROCM_HOME")
}
if rocmHome == "" {
log.Println("warning: ROCM_PATH is not set. Trying a likely fallback path, but it is recommended to set this variable in the environment.")
rocmHome = "/opt/rocm"
}
cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
csvData := csv.NewReader(&stdout)
// llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME.
totalBiggestCard := int64(0)
bigCardName := ""
for {
record, err := csvData.Read()
if err == io.EOF {
break
}
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
if !strings.HasPrefix(record[0], "card") {
continue
}
cardTotal, err := strconv.ParseInt(record[1], 10, 64)
if err != nil {
return 0, err
}
cardUsed, err := strconv.ParseInt(record[2], 10, 64)
if err != nil {
return 0, err
}
possible := (cardTotal - cardUsed)
log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0])
if possible > totalBiggestCard {
totalBiggestCard = possible
bigCardName = record[0]
}
}
if totalBiggestCard == 0 {
log.Printf("found ROCm GPU but failed to parse free VRAM!")
return 0, errNoAccel
}
log.Printf("ROCm selecting device %q", bigCardName)
return totalBiggestCard, nil
}

@@ -1,7 +1,7 @@
 package llm

 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
@@ -25,6 +25,8 @@ package llm
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
+// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
@@ -35,7 +37,7 @@ package llm
 #cgo windows LDFLAGS: -lext_server_shared -lpthread

 #include <stdlib.h>
-#include "examples/server/server.h"
+#include "server.h"
 */
 import "C"
@@ -43,25 +45,51 @@
 	"bytes"
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
 	"os"
 	"runtime"
+	"strings"
 	"sync"
 	"time"
 	"unsafe"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
 )
-func errWrap(resp C.ext_server_err) error {
-	if resp.code == 0 {
-		return nil
-	}
-	err := fmt.Errorf(C.GoString(resp.err))
-	C.free(unsafe.Pointer(resp.err))
-	return err
-}
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf(C.GoString(resp.msg))
+}
+
+type extServer interface {
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char)
+}

 type llamaExtServer struct {
@ -71,21 +99,61 @@ type llamaExtServer struct {
// Note: current implementation does not support concurrent instantiations // Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex var mutex sync.Mutex
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) { func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
server := &llamaExtServer{opts}
return newExtServer(server, model, adapters, projectors, numLayers, opts)
}
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
if !mutex.TryLock() { if !mutex.TryLock() {
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock() mutex.Lock()
} }
server := &llamaExtServer{opts}
fileInfo, err := os.Stat(model) fileInfo, err := os.Stat(model)
if err != nil { if err != nil {
return nil, err return nil, err
} }
var sparams C.ext_server_params var sparams C.ext_server_params_t
sparams.model = C.CString(model) sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model)) defer C.free(unsafe.Pointer(sparams.model))
numGPU := NumGPU(numLayers, fileInfo.Size(), opts) numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
sparams.embedding = true sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx) sparams.n_ctx = C.uint(opts.NumCtx)
@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
// Always use the value encoded in the model // Always use the value encoded in the model
sparams.rope_freq_base = 0.0 sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0 sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
sparams.numa = C.bool(opts.UseNUMA)
sparams.lora_adapters = nil sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ { for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter)) la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la)) defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i]) la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter)) defer C.free(unsafe.Pointer(la.adapter))
@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
} }
} }
// TODO - implement ME if len(projectors) > 0 {
// if len(projectors) > 0 { // TODO: applying multiple projectors is not supported by the llama.cpp server yet
// // TODO: applying multiple projectors is not supported by the llama.cpp server yet sparams.mmproj = C.CString(projectors[0])
// params = append(params, "--mmproj", projectors[0]) defer C.free(unsafe.Pointer(sparams.mmproj))
// } } else {
sparams.mmproj = nil
}
if opts.NumThread > 0 { if opts.NumThread > 0 {
sparams.n_threads = C.uint(opts.NumThread) sparams.n_threads = C.uint(opts.NumThread)
@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
sparams.n_threads = C.uint(runtime.NumCPU()) sparams.n_threads = C.uint(runtime.NumCPU())
} }
sparams.memory_f16 = false
if opts.F16KV {
sparams.memory_f16 = true
}
sparams.use_mlock = false
if opts.UseMLock {
sparams.use_mlock = true
}
sparams.use_mmap = true
if !opts.UseMMap {
sparams.use_mmap = false
}
sparams.numa = false
if opts.UseNUMA {
sparams.numa = true
}
log.Printf("Initializing internal llama server") log.Printf("Initializing internal llama server")
err = errWrap(C.llama_server_init(&sparams)) resp := newExtServerResp(128)
if err != nil { defer freeExtServerResp(resp)
return nil, err server.llama_server_init(&sparams, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
} }
log.Printf("Starting internal llama main loop") log.Printf("Starting internal llama main loop")
C.llama_server_start() server.llama_server_start()
return server, nil return server, nil
} }
func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.Options, ctx, pred, fn)
}
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
log.Printf("loaded %d images", len(imageData))
request := map[string]any{ request := map[string]any{
"prompt": predict.Prompt, "prompt": predict.Prompt,
"stream": true, "stream": true,
"n_predict": llm.NumPredict, "n_predict": opts.NumPredict,
"n_keep": llm.NumKeep, "n_keep": opts.NumKeep,
"temperature": llm.Temperature, "temperature": opts.Temperature,
"top_k": llm.TopK, "top_k": opts.TopK,
"top_p": llm.TopP, "top_p": opts.TopP,
"tfs_z": llm.TFSZ, "tfs_z": opts.TFSZ,
"typical_p": llm.TypicalP, "typical_p": opts.TypicalP,
"repeat_last_n": llm.RepeatLastN, "repeat_last_n": opts.RepeatLastN,
"repeat_penalty": llm.RepeatPenalty, "repeat_penalty": opts.RepeatPenalty,
"presence_penalty": llm.PresencePenalty, "presence_penalty": opts.PresencePenalty,
"frequency_penalty": llm.FrequencyPenalty, "frequency_penalty": opts.FrequencyPenalty,
"mirostat": llm.Mirostat, "mirostat": opts.Mirostat,
"mirostat_tau": llm.MirostatTau, "mirostat_tau": opts.MirostatTau,
"mirostat_eta": llm.MirostatEta, "mirostat_eta": opts.MirostatEta,
"penalize_nl": llm.PenalizeNewline, "penalize_nl": opts.PenalizeNewline,
"seed": llm.Seed, "seed": opts.Seed,
"stop": llm.Stop, "stop": opts.Stop,
"image_data": imageData,
} }
if predict.Format == "json" { if predict.Format == "json" {
request["grammar"] = jsonGrammar request["grammar"] = jsonGrammar
} }
// Handling JSON marshaling with special characters unescaped. retryDelay := 100 * time.Microsecond
buffer := &bytes.Buffer{} for retries := 0; retries < maxRetries; retries++ {
enc := json.NewEncoder(buffer) if retries > 0 {
enc.SetEscapeHTML(false) time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
if err := enc.Encode(request); err != nil { // Handling JSON marshaling with special characters unescaped.
return fmt.Errorf("failed to marshal data: %w", err) buffer := &bytes.Buffer{}
} enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
req := C.CString(buffer.String()) if err := enc.Encode(request); err != nil {
defer C.free(unsafe.Pointer(req)) return fmt.Errorf("failed to marshal data: %w", err)
}
cmpCtx := C.llama_server_completion(req) req := C.CString(buffer.String())
if cmpCtx.task_id < 0 { defer C.free(unsafe.Pointer(req))
defer C.free(unsafe.Pointer(cmpCtx.err))
return fmt.Errorf(C.GoString(cmpCtx.err))
}
for { llm.llama_server_completion(req, &resp)
select { if resp.id < 0 {
case <-ctx.Done(): return extServerResponseToErr(resp)
// This handles the request cancellation }
return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
default:
result := C.llama_server_completion_next_result(cmpCtx.task_id)
if result.result_json != nil {
defer C.free(unsafe.Pointer(result.result_json))
}
var p prediction
if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
}
if p.Content != "" { retryNeeded := false
fn(PredictResult{ out:
// Model: predict.Model, // XXX remove or replace? for {
CreatedAt: time.Now().UTC(), select {
Content: p.Content, case <-ctx.Done():
}) // This handles the request cancellation
} llm.llama_server_completion_cancel(resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
default:
var result C.ext_server_task_result_t
llm.llama_server_completion_next_result(resp.id, &result)
json_resp := C.GoString(result.json_resp)
llm.llama_server_release_task_result(&result)
if p.Stop { var p prediction
fn(PredictResult{ if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
// Model: predict.Model, // XXX remove or replace? llm.llama_server_completion_cancel(resp.id, &resp)
CreatedAt: time.Now().UTC(), if resp.id < 0 {
TotalDuration: time.Since(predict.CheckpointStart), return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
Done: true, } else {
PromptEvalCount: p.Timings.PromptN, return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), }
EvalCount: p.Timings.PredictedN, }
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
}) if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
return nil retryNeeded = true
// task will already be canceled
break out
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
} }
} }
if !retryNeeded {
return nil // success
}
} }
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
} }
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt}) data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil { if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err) return nil, fmt.Errorf("marshaling encode data: %w", err)
} }
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_tokenize(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_tokenize(req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
} }
defer llm.llama_server_release_json_resp(&json_resp)
var encoded TokenizeResponse var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2) return nil, fmt.Errorf("unmarshal encode response: %w", err2)
} }
@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er
} }
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 { if len(tokens) == 0 {
return "", nil return "", nil
} }
@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_detokenize(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_detokenize(req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
} }
defer llm.llama_server_release_json_resp(&json_resp)
var decoded DetokenizeResponse var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil { if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2) return "", fmt.Errorf("unmarshal encode response: %w", err2)
} }
@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
} }
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input}) data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil { if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err) return nil, fmt.Errorf("error marshaling embed data: %w", err)
@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float
req := C.CString(string(data)) req := C.CString(string(data))
defer C.free(unsafe.Pointer(req)) defer C.free(unsafe.Pointer(req))
var resp C.ext_server_resp var json_resp *C.char
err = errWrap(C.llama_server_embedding(req, &resp)) resp := newExtServerResp(128)
if resp.json_resp != nil { defer freeExtServerResp(resp)
defer C.free(unsafe.Pointer(resp.json_resp)) llm.llama_server_embedding(req, &json_resp, &resp)
} if resp.id < 0 {
if err != nil { return nil, extServerResponseToErr(resp)
return nil, err
} }
defer llm.llama_server_release_json_resp(&json_resp)
var embedding EmbeddingResponse var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil { if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err) return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
} }
return embedding.Embedding, nil return embedding.Embedding, nil
} }
func (llm *llamaExtServer) Ping(ctx context.Context) error { func (llm *llamaExtServer) Close() {
// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state close(llm)
return nil
} }
func (llm *llamaExtServer) Close() { func close(llm extServer) {
C.llama_server_stop() llm.llama_server_stop()
mutex.Unlock() mutex.Unlock()
} }
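
The point of routing everything through the `extServer` interface and the shared `newExtServer`/`predict`/`encode`/`decode`/`embedding`/`close` helpers is that another backend (for example a ROCm build loaded from a separate shared library, or a test double) only has to supply the thin `llama_server_*` wrappers. Because the real signatures are cgo types, a standalone example is awkward, so the sketch below illustrates the same pattern in plain Go; the `backend`, `generate`, and `mockBackend` names are illustrative only and not part of the commit:

```
package main

import "fmt"

// backend captures the thin primitives each server implementation supplies,
// in the same spirit as the llama_server_* methods on the extServer interface.
type backend interface {
	start()
	completion(req string) (string, error)
	stop()
}

// generate is the shared helper: like predict/encode/decode/embedding above,
// it depends only on the interface, so any backend can reuse the same logic.
func generate(b backend, prompt string) (string, error) {
	b.start()
	defer b.stop()
	return b.completion(prompt)
}

// mockBackend stands in for llamaExtServer or a hypothetical ROCm-backed server.
type mockBackend struct{}

func (mockBackend) start() {}
func (mockBackend) stop()  {}
func (mockBackend) completion(req string) (string, error) {
	return "echo: " + req, nil
}

func main() {
	out, err := generate(mockBackend{}, "hello")
	fmt.Println(out, err)
}
```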

@ -1,57 +0,0 @@
//go:build linux || windows
package llm
import (
"errors"
"log"
"github.com/jmorganca/ollama/api"
)
/*
#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/"
#cgo linux LDFLAGS: -lnvidia-ml
#include <stdlib.h>
#include "examples/server/server.h"
*/
import "C"
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
return int64(C.check_vram()), nil
}
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
freeBytes, err := CheckVRAM()
if err != nil {
if !errors.Is(err, errNvidiaSMI) {
log.Print(err.Error())
}
// nvidia driver not installed or no nvidia GPU found
return 0
}
/*
Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
We can store the model weights and the kv cache in vram,
to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
*/
bytesPerLayer := fileSizeBytes / numLayer
// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
layers := int(freeBytes/bytesPerLayer) * 3 / 4
// TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
// if int64(layers) < numLayer {
// log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
// return 0
// }
log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer)
return layers
}

@@ -1,10 +1,11 @@
 # common logic across linux and darwin

 init_vars() {
+    LLAMACPP_DIR=gguf
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server"
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
     if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
@@ -29,6 +30,6 @@ apply_patches() {
 }

 build() {
-    cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
 }

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp
@@ -30,6 +30,7 @@ git_module_setup
 apply_patches
 build

+# TODO - improve this to handle test cases that need it to be in "." around the tree
 # Enable local debug/run usecase
 if [ -e "gguf/ggml-metal.metal" ]; then
     cp gguf/ggml-metal.metal ../../

@@ -1,17 +1,73 @@
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp

 set -ex
 set -o pipefail

-# TODO - stopped here - map the variables from above over and refine the case statement below
 echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/cuda"
 git_module_setup
 apply_patches
+CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cuda"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ../../dist/
 build
# TODO - explore mechanism to soften the hard cuda dependency on linux
# by conditionally building some archive here that aggregates the cuda libs if present
# so that the cgo flags link this intermediate archive instead of the underlying cuda libs
#
# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \
# -Wl,--whole-archive \
# ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \
# ${BUILD_DIR}/common/libcommon.a \
# ${BUILD_DIR}/libllama.a \
# ${BUILD_DIR}/examples/llava/libllava_static.a \
# -Wl,--no-whole-archive \
# -lrt -lpthread -ldl -lstdc++ -lm \
# /usr/local/cuda/lib64/libcudart_static.a \
# /usr/local/cuda/lib64/libcublas_static.a \
# /usr/local/cuda/lib64/libcublasLt_static.a \
# /usr/local/cuda/lib64/libcudadevrt.a \
# /usr/local/cuda/lib64/libculibos.a
if [ -z "${ROCM_PATH}" ] ; then
# Try the default location in case it exists
ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ] ; then
# Try the default location in case it exists
if [ -d /usr/lib/cmake/CLBlast ]; then
export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
fi
BUILD_DIR="gguf/build/rocm"
LIB_DIR="${BUILD_DIR}/lib"
mkdir -p ${LIB_DIR}
# Ensure we have at least one file present for the embed
touch ${LIB_DIR}/.generated
if [ -d "${ROCM_PATH}" ] ; then
echo "Building ROCm"
init_vars
CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
build
gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/examples/server/libext_server.a \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,--no-whole-archive \
-lrt -lpthread -ldl -lstdc++ -lm \
-L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
-lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
fi

@@ -48,4 +48,8 @@ init_vars
 git_module_setup
 apply_patches
 build
 install
+
+# TODO - implement ROCm support on windows
+md gguf/build/winrocm/lib -ea 0
+echo $null >> gguf/build/winrocm/lib/.generated

@@ -1,3 +1,3 @@
 package llm

-//go:generate sh ./gen_linux.sh
+//go:generate bash ./gen_linux.sh

@ -1,24 +0,0 @@
//go:build cuda
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate rm -rf ggml/build/cuda
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf gguf/build/cuda
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner

@ -1,25 +0,0 @@
//go:build rocm
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf ggml/build/rocm
//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/rocm --target server --config Release
//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner
//go:generate rm -rf gguf/build/rocm
//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'
//go:generate cmake --build gguf/build/rocm --target server --config Release
//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner

@ -1,15 +1,15 @@
From 64b3fbb150d12b3ca63ac2fb4e57bc46f41d2ccd Mon Sep 17 00:00:00 2001 From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com> From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 13 Nov 2023 12:25:58 -0800 Date: Mon, 13 Nov 2023 12:25:58 -0800
Subject: [PATCH] Expose callable API for server Subject: [PATCH] Expose callable API for server
This adds an extern "C" interface within the example server This adds an extern "C" interface within the example server
--- ---
examples/server/CMakeLists.txt | 24 ++++ examples/server/CMakeLists.txt | 24 +++
examples/server/server.cpp | 247 +++++++++++++++++++++++++++++++++ examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++
examples/server/server.h | 83 +++++++++++ examples/server/server.h | 89 +++++++++++
ggml-cuda.cu | 1 + ggml-cuda.cu | 1 +
4 files changed, 355 insertions(+) 4 files changed, 388 insertions(+)
create mode 100644 examples/server/server.h create mode 100644 examples/server/server.h
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
+endif() +endif()
\ No newline at end of file \ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 895f751..f939590 100644 index d0cd8e1..5f5d4c5 100644
--- a/examples/server/server.cpp --- a/examples/server/server.cpp
+++ b/examples/server/server.cpp +++ b/examples/server/server.cpp
@@ -5,6 +5,9 @@ @@ -5,6 +5,9 @@
@ -59,7 +59,7 @@ index 895f751..f939590 100644
#ifndef NDEBUG #ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
@@ -2631,6 +2634,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con @@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
} }
} }
@ -67,84 +67,84 @@ index 895f751..f939590 100644
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// own arguments required by this example // own arguments required by this example
@@ -3065,3 +3069,246 @@ int main(int argc, char **argv) @@ -3066,3 +3070,273 @@ int main(int argc, char **argv)
llama_backend_free(); llama_backend_free();
return 0; return 0;
} }
+ +
+#else // LLAMA_SERVER_LIBRARY +#else // LLAMA_SERVER_LIBRARY
+// Expose the llama server as a callable extern "C" API +// Expose the llama server as a callable extern "C" API
+llama_server_context llama; +llama_server_context *llama = NULL;
+std::atomic<bool> ext_server_running(false); +std::atomic<bool> ext_server_running(false);
+std::thread ext_server_thread; +std::thread ext_server_thread;
+inline ext_server_err makeErr(uint32_t code, std::string msg) {
+ if (code == 0) {
+ return ext_server_err{0, NULL};
+ }
+ const std::string::size_type size = msg.size();
+ ext_server_err ret = {
+ code,
+ new char[size + 1],
+ };
+ memcpy(ret.err, msg.c_str(), size + 1);
+ return ret;
+}
+ +
+ext_server_err llama_server_init(ext_server_params *sparams) +void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err)
+{ +{
+ log_set_target(stdout); + assert(err != NULL && sparams != NULL);
+ gpt_params params; + err->id = 0;
+ params.n_ctx = sparams->n_ctx; + err->msg[0] = '\0';
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
+
+ if (sparams->memory_f16) {
+ params.cache_type_k = "f16";
+ params.cache_type_v = "f16";
+ } else {
+ params.cache_type_k = "f32";
+ params.cache_type_v = "f32";
+ }
+
+ params.n_gpu_layers = sparams->n_gpu_layers;
+ params.main_gpu = sparams->main_gpu;
+ params.use_mlock = sparams->use_mlock;
+ params.use_mmap = sparams->use_mmap;
+ params.numa = sparams->numa;
+ params.embedding = sparams->embedding;
+ if (sparams->model != NULL) {
+ params.model = sparams->model;
+ }
+
+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+ }
+
+ try { + try {
+ llama = new llama_server_context;
+ log_set_target(stdout);
+ gpt_params params;
+ params.n_ctx = sparams->n_ctx;
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
+
+ if (sparams->memory_f16) {
+ params.cache_type_k = "f16";
+ params.cache_type_v = "f16";
+ } else {
+ params.cache_type_k = "f32";
+ params.cache_type_v = "f32";
+ }
+
+ params.n_gpu_layers = sparams->n_gpu_layers;
+ params.main_gpu = sparams->main_gpu;
+ params.use_mlock = sparams->use_mlock;
+ params.use_mmap = sparams->use_mmap;
+ params.numa = sparams->numa;
+ params.embedding = sparams->embedding;
+ if (sparams->model != NULL) {
+ params.model = sparams->model;
+ }
+
+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+ }
+
+ if (sparams->mmproj != NULL) {
+ params.mmproj = std::string(sparams->mmproj);
+ }
+
+ llama_backend_init(params.numa); + llama_backend_init(params.numa);
+ +
+ // load the model + // load the model
+ if (!llama.load_model(params)) + if (!llama->load_model(params))
+ { + {
+ // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages + // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages
+ // and pass them back to the caller for better UX + // and pass them back to the caller for better UX
+ return makeErr(1, "error loading model " + params.model); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+ return;
+ } + }
+ +
+ llama.initialize(); + llama->initialize();
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception initializing llama server"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+void llama_server_start() +void llama_server_start()
+{ +{
+ assert(llama != NULL);
+ // TODO mutex to protect thread creation + // TODO mutex to protect thread creation
+ ext_server_thread = std::thread([&]() + ext_server_thread = std::thread([&]()
+ { + {
@ -154,7 +154,7 @@ index 895f751..f939590 100644
+ ggml_time_init(); + ggml_time_init();
+ while (ext_server_running.load()) + while (ext_server_running.load())
+ { + {
+ if (!llama.update_slots()) { + if (!llama->update_slots()) {
+ LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n"); + LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n");
+ break; + break;
+ } + }
@ -170,124 +170,150 @@ index 895f751..f939590 100644
+} +}
+ +
+void llama_server_stop() { +void llama_server_stop() {
+ assert(llama != NULL);
+ // TODO - too verbose, remove once things are solid + // TODO - too verbose, remove once things are solid
+ LOG_TEE("requesting llama server shutdown\n"); + LOG_TEE("requesting llama server shutdown\n");
+ ext_server_running = false; + ext_server_running = false;
+ ext_server_thread.join(); + ext_server_thread.join();
+ delete llama;
+ llama = NULL;
+ LOG_TEE("llama server shutdown complete\n"); + LOG_TEE("llama server shutdown complete\n");
+} +}
+ +
+ext_server_completion_resp llama_server_completion(const char *json_req) { +void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+ std::string msg; + assert(llama != NULL && json_req != NULL && resp != NULL);
+ ext_server_completion_resp resp = { + resp->id = -1;
+ 0, + resp->msg[0] = '\0';
+ NULL,
+ };
+ try { + try {
+ json data = json::parse(json_req); + json data = json::parse(json_req);
+ resp.task_id = llama.request_completion(data, false, false, -1); + resp->id = llama->request_completion(data, false, false, -1);
+ return resp;
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ msg = e.what(); + snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ msg = "Unknown Exception during completion"; + snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+ } + }
+ const std::string::size_type size = msg.size();
+ resp.task_id = 0;
+ resp.err = new char[size + 1];
+ memcpy(resp.err, msg.c_str(), size + 1);
+ return resp;
+} +}
+ +
+ext_task_result llama_server_completion_next_result(const int task_id) { +void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) {
+ assert(llama != NULL && resp != NULL);
+ std::string msg; + std::string msg;
+ ext_task_result resp = {-1,false,false,NULL}; + resp->id = -1;
+ try { + resp->stop = false;
+ task_result result = llama.next_result(task_id); + resp->error = false;
+ std::string result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+ const std::string::size_type size = result_json.size();
+ resp.id = result.id;
+ resp.stop = result.stop;
+ resp.error = result.error;
+ resp.result_json = new char[size + 1];
+ memcpy(resp.result_json, result_json.c_str(), size + 1);
+ if (result.error) {
+ llama.request_cancel(task_id);
+ } else if (result.stop) {
+ llama.request_cancel(task_id);
+ }
+ return resp;
+ } catch (std::exception &e) {
+ msg = e.what(); // TODO - json?
+ } catch (...) {
+ msg = "Unknown Exception during completion";
+ }
+ resp.error = true;
+ const std::string::size_type size = msg.size();
+ resp.result_json = new char[size + 1];
+ memcpy(resp.result_json, msg.c_str(), size + 1);
+ return resp;
+}
+
+ext_server_err llama_server_completion_cancel(const int task_id) {
+ try {
+ llama.request_cancel(task_id);
+ } catch (std::exception &e) {
+ return makeErr(1, e.what());
+ } catch (...) {
+ return makeErr(1, "Unknown Exception running llama server");
+ }
+ return makeErr(0, "");
+}
+
+
+ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp) {
+ resp->json_resp = NULL; + resp->json_resp = NULL;
+ std::string result_json;
+ try {
+ task_result result = llama->next_result(task_id);
+ result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+ resp->id = result.id;
+ resp->stop = result.stop;
+ resp->error = result.error;
+ if (result.error) {
+ llama->request_cancel(task_id);
+ } else if (result.stop) {
+ llama->request_cancel(task_id);
+ }
+ } catch (std::exception &e) {
+ resp->error = true;
+ resp->id = -1;
+ result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
+ } catch (...) {
+ resp->error = true;
+ resp->id = -1;
+ result_json = "{\"error\":\"Unknown exception during completion\"}";
+ }
+ const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size];
+ snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+ if (result == NULL || result->json_resp == NULL) {
+ return;
+ }
+ delete[] result->json_resp;
+}
+
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+ assert(llama != NULL && err != NULL);
+ err->id = 0;
+ err->msg[0] = '\0';
+ try {
+ llama->request_cancel(task_id);
+ } catch (std::exception &e) {
+ err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) {
+ err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server");
+ }
+}
+
+void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ std::vector<llama_token> tokens; + std::vector<llama_token> tokens;
+ if (body.count("content") != 0) + if (body.count("content") != 0)
+ { + {
+ tokens = llama.tokenize(body["content"], false); + tokens = llama->tokenize(body["content"], false);
+ } + }
+ const json data = format_tokenizer_response(tokens); + const json data = format_tokenizer_response(tokens);
+ std::string result_json = data.dump(); + std::string result_json = data.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during tokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp) { +void llama_server_release_json_resp(char **json_resp) {
+ resp->json_resp = NULL; + if (json_resp == NULL || *json_resp == NULL) {
+ return;
+ }
+ delete[] *json_resp;
+}
+
+void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ std::string content; + std::string content;
+ if (body.count("tokens") != 0) + if (body.count("tokens") != 0)
+ { + {
+ const std::vector<llama_token> tokens = body["tokens"]; + const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
+ } + }
+ const json data = format_detokenized_response(content); + const json data = format_detokenized_response(content);
+ std::string result_json = data.dump(); + std::string result_json = data.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during detokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp) { +void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) {
+ resp->json_resp = NULL; + assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+ *json_resp = NULL;
+ err->id = 0;
+ err->msg[0] = '\0';
+ try { + try {
+ const json body = json::parse(json_req); + const json body = json::parse(json_req);
+ json prompt; + json prompt;
@ -299,28 +325,29 @@ index 895f751..f939590 100644
+ { + {
+ prompt = ""; + prompt = "";
+ } + }
+ const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); + const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+ task_result result = llama.next_result(task_id); + task_result result = llama->next_result(task_id);
+ std::string result_json = result.result_json.dump(); + std::string result_json = result.result_json.dump();
+ const std::string::size_type size = result_json.size(); + const std::string::size_type size = result_json.size() + 1;
+ resp->json_resp = new char[size + 1]; + *json_resp = new char[size];
+ memcpy(resp->json_resp, result_json.c_str(), size + 1); + snprintf(*json_resp, size, "%s", result_json.c_str());
+ } catch (std::exception &e) { + } catch (std::exception &e) {
+ return makeErr(1, e.what()); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "exception %s", e.what());
+ } catch (...) { + } catch (...) {
+ return makeErr(1, "Unknown Exception during detokenize"); + err->id = -1;
+ snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+ } + }
+ return makeErr(0, "");
+} +}
+ +
+#endif // LLAMA_SERVER_LIBRARY +#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file \ No newline at end of file
diff --git a/examples/server/server.h b/examples/server/server.h diff --git a/examples/server/server.h b/examples/server/server.h
new file mode 100644 new file mode 100644
index 0000000..4d03b1e index 0000000..d22f1b6
--- /dev/null --- /dev/null
+++ b/examples/server/server.h +++ b/examples/server/server.h
@@ -0,0 +1,83 @@ @@ -0,0 +1,89 @@
+#if defined(LLAMA_SERVER_LIBRARY) +#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H +#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H +#define LLAMA_SERVER_H
@ -336,20 +363,23 @@ index 0000000..4d03b1e
+extern "C" +extern "C"
+{ +{
+#endif +#endif
+ // TODO - clean the type def's up a bit for better consistency + typedef struct ext_server_resp {
+ typedef struct ext_server_err { + int id; // < 0 on error
+ uint32_t code; // 0 on success, > 0 on error + size_t msg_len; // caller must allocate msg and set msg_len
+ char *err; // null if code == 0; else contains error message. Caller responsible for freeing memory + char *msg;
+ } ext_server_err; + } ext_server_resp_t;
+ +
+ // Allocated and freed by caller
+ typedef struct ext_server_lora_adapter { + typedef struct ext_server_lora_adapter {
+ char *adapter; + char *adapter;
+ float scale; + float scale;
+ struct ext_server_lora_adapter *next; + struct ext_server_lora_adapter *next;
+ } ext_server_lora_adapter; + } ext_server_lora_adapter_t;
+
+ // Allocated and freed by caller
+ typedef struct ext_server_params + typedef struct ext_server_params
+ { + {
+ char *model; + char *model;
+ uint32_t n_ctx; // text context, 0 = from model + uint32_t n_ctx; // text context, 0 = from model
+ uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_batch; // prompt processing maximum batch size
+ uint32_t n_threads; // number of threads to use for generation + uint32_t n_threads; // number of threads to use for generation
@ -363,40 +393,43 @@ index 0000000..4d03b1e
+ bool use_mmap; // use mmap if possible + bool use_mmap; // use mmap if possible
+ bool numa; // attempt optimizations that help on some NUMA systems + bool numa; // attempt optimizations that help on some NUMA systems
+ bool embedding; // get only sentence embedding + bool embedding; // get only sentence embedding
+ ext_server_lora_adapter* lora_adapters; + ext_server_lora_adapter_t* lora_adapters;
+ } ext_server_params; + char *mmproj;
+ } ext_server_params_t;
+ +
+ // Initialize the server once per process + typedef struct ext_server_task_result
+ ext_server_err llama_server_init(ext_server_params *sparams);
+
+ // Run the main loop
+ void llama_server_start();
+ // Stop the main loop
+ void llama_server_stop();
+
+ typedef struct ext_task_result
+ { + {
+ int id; + int id;
+ bool stop; + bool stop;
+ bool error; + bool error;
+ char* result_json; // caller responsible to free this memory + char* json_resp; // null terminated, memory managed by ext_server
+ } ext_task_result; + } ext_server_task_result_t;
+
+ typedef struct ext_server_completion_resp {
+ int task_id; // < 0 on error, >= 0 on success
+ char *err; // null if task_id >= 0; else contains error message. Caller responsible for freeing memory
+ } ext_server_completion_resp;
+ ext_server_completion_resp llama_server_completion(const char *json_req);
+ ext_task_result llama_server_completion_next_result(const int task_id);
+ ext_server_err llama_server_completion_cancel(const int task_id);
+ +
+ // Caller responsible for freeing json_resp + // Initialize the server once per process
+ typedef struct ext_server_resp { + // err->id = 0 for success and err->msg[0] = NULL
+ char *json_resp; // Caller responsible for freeing string + // err->id != 0 for failure, and err->msg contains error message
+ } ext_server_resp; + void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+ ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp); +
+ ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp); + // Run the main loop, called once per init
+ ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp); + void llama_server_start();
+ // Stop the main loop and free up resources allocated in init and start. Init must be called again to reuse
+ void llama_server_stop();
+
+ // json_req null terminated string, memory managed by caller
+ // resp->id >= 0 on success (task ID)
+ // resp->id < 0 on error, and resp->msg contains error message
+ void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+ // Caller must call llama_server_release_task_result to free resp->json_resp
+ void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result);
+ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+ void llama_server_release_task_result(ext_server_task_result_t *result);
+
+ // Caller must call llama_server_release_json_resp to free json_resp when the call succeeds (err.id == 0)
+ void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err);
+ void llama_server_release_json_resp(char **json_resp);
+ +
+#ifdef __cplusplus +#ifdef __cplusplus
+} +}
@ -406,10 +439,10 @@ index 0000000..4d03b1e
+#endif // LLAMA_SERVER_LIBRARY +#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file \ No newline at end of file
diff --git a/ggml-cuda.cu b/ggml-cuda.cu diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 85f7a29..ce51364 100644 index 9e1acd3..ea64b55 100644
--- a/ggml-cuda.cu --- a/ggml-cuda.cu
+++ b/ggml-cuda.cu +++ b/ggml-cuda.cu
@@ -6410,6 +6410,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( @@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
src_ptr = (char *) extra->data_device[id]; src_ptr = (char *) extra->data_device[id];
} else { } else {


@ -3,6 +3,7 @@ package llm
import ( import (
"bytes" "bytes"
"context" "context"
_ "embed"
"errors" "errors"
"fmt" "fmt"
"os" "os"
@ -112,12 +113,6 @@ type ImageData struct {
ID int `json:"id"` ID int `json:"id"`
} }
type llama struct {
api.Options
ImageData []ImageData
Running
}
var ( var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
@ -166,7 +161,8 @@ type prediction struct {
} }
const maxBufferSize = 512 * format.KiloByte const maxBufferSize = 512 * format.KiloByte
const maxRetries = 6 const maxRetries = 3
const retryDelay = 1 * time.Second
type PredictOpts struct { type PredictOpts struct {
Prompt string Prompt string
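For orientation only, here is a generic Go sketch of how constants like maxRetries and retryDelay are typically consumed. This is not the retry code in llama.go (that code sits outside the hunk above); doRequest is a hypothetical stand-in for the call being retried.

package main

import (
	"errors"
	"fmt"
	"time"
)

const (
	maxRetries = 3
	retryDelay = 1 * time.Second
)

// doRequest is a hypothetical stand-in for the flaky call being retried.
func doRequest() error {
	return errors.New("connection refused")
}

func main() {
	var err error
	for try := 0; try < maxRetries; try++ {
		if err = doRequest(); err == nil {
			break
		}
		fmt.Printf("attempt %d failed: %v; retrying in %s\n", try+1, err, retryDelay)
		time.Sleep(retryDelay)
	}
	if err != nil {
		fmt.Println("giving up:", err)
	}
}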


@ -11,6 +11,7 @@ import (
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format" "github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/gpu"
) )
type LLM interface { type LLM interface {
@ -19,7 +20,6 @@ type LLM interface {
Encode(context.Context, string) ([]int, error) Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error) Decode(context.Context, []int) (string, error)
Close() Close()
Ping(context.Context) error
} }
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
@ -78,5 +78,17 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
opts.NumGQA = 0 opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0 opts.RopeFrequencyScale = 0.0
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) gpuInfo := gpu.GetGPUInfo()
switch gpuInfo.Driver {
case "ROCM":
return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
default:
// Rely on the built-in CUDA based server which will fall back to CPU
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
}
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
} }

llm/rocm_shim.c Normal file

@ -0,0 +1,134 @@
#include "rocm_shim.h"
#include <stdio.h>
#include <string.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("Lazy loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY);
if (!s->handle) {
err->id = -1;
snprintf(
err->msg, err->msg_len,
"Unable to load rocm server library: %s (If you have a Radeon card, "
"did you install the ROCM libraries?)",
LOAD_ERR());
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, LOAD_ERR());
return;
}
}
}
inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
s.llama_server_start();
}
inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
s.llama_server_stop();
}
inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

llm/rocm_shim.h Normal file

@ -0,0 +1,73 @@
#include <stdlib.h>
#include "server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct rocm_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void rocm_shim_llama_server_start(struct rocm_llama_server s);
void rocm_shim_llama_server_stop(struct rocm_llama_server s);
void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result);
void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result);
void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
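To illustrate the indirection noted above (Go cannot invoke the C function pointers stored in rocm_llama_server directly, so calls go through the plain C wrappers), here is a minimal cgo sketch that is not part of the change. It assumes rocm_shim.h and server.h are on the include path, rocm_shim.c is compiled into the same package, and the library path is hypothetical.

package main

/*
#include <stdlib.h>
#include "rocm_shim.h"
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// Hypothetical path to a dynamically loadable ROCm build of the server.
	libPath := C.CString("./librocm_server.so")
	defer C.free(unsafe.Pointer(libPath))

	// The caller owns the error buffer: allocate msg and set msg_len.
	var errResp C.ext_server_resp_t
	errResp.msg_len = 512
	errResp.msg = (*C.char)(C.malloc(512))
	defer C.free(unsafe.Pointer(errResp.msg))

	// Resolve all llama_server_* symbols out of the shared library.
	var srv C.struct_rocm_llama_server
	C.rocm_shim_init(libPath, &srv, &errResp)
	if errResp.id < 0 {
		fmt.Println("failed to load ROCm server library:", C.GoString(errResp.msg))
		return
	}

	// srv.llama_server_start and friends are C function pointers, which Go
	// cannot call directly; every call goes through the matching
	// rocm_shim_llama_server_* wrapper instead, e.g.
	//   C.rocm_shim_llama_server_init(srv, &params, &errResp)
	//   C.rocm_shim_llama_server_start(srv)
	fmt.Println("shim loaded")
}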

llm/shim_darwin.go Normal file

@ -0,0 +1,18 @@
package llm
import (
"fmt"
"github.com/jmorganca/ollama/api"
)
// no-op stubs for mac
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// should never happen...
return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
}
func nativeInit(workDir string) error {
return nil
}

llm/shim_ext_server.go Normal file

@ -0,0 +1,212 @@
//go:build !darwin
package llm
/*
#include <stdlib.h>
#include "rocm_shim.h"
*/
import "C"
import (
"context"
"embed"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"runtime"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
//go:embed llama.cpp/gguf/build/*/lib/*
var libEmbed embed.FS
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
var NoShim = true
type shimExtServer struct {
s C.struct_rocm_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var shimMutex sync.Mutex
var llm *shimExtServer
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_init(llm.s, sparams, err)
}
func (llm *shimExtServer) llama_server_start() {
C.rocm_shim_llama_server_start(llm.s)
}
func (llm *shimExtServer) llama_server_stop() {
C.rocm_shim_llama_server_stop(llm.s)
}
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
}
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_release_task_result(llm.s, result)
}
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
}
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
if NoShim {
return nil, RocmShimMissing
}
log.Printf("Loading ROCM llm server")
if llm == nil {
return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
}
llm.options = opts
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
}
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.options, ctx, pred, fn)
}
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *shimExtServer) Close() {
close(llm)
}
func nativeInit(workdir string) error {
err := extractLib(workdir)
if err != nil {
if err == RocmShimMissing {
log.Printf("%s", err)
return nil
}
return err
}
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
return err
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
shimMutex.Lock()
defer shimMutex.Unlock()
if llm != nil {
return nil
}
var libName string
switch runtime.GOOS {
case "darwin":
// shouldn't happen
return nil
case "linux":
libName = "librocm_server.so"
case "windows":
libName = "rocm_server.dll"
default:
// shouldn't happen
return nil
}
libPath := C.CString(filepath.Join(workdir, libName))
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_rocm_llama_server
C.rocm_shim_init(libPath, &srv, &resp)
if resp.id < 0 {
// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
// and run against CPU
return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
}
llm = &shimExtServer{
s: srv,
options: api.DefaultOptions(),
}
return nil
}
func extractLib(workDir string) error {
files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*")
if err != nil || len(files) == 0 {
// this is expected, ollama may be compiled without shim library packed in
return RocmShimMissing
}
if len(files) != 1 {
// Shouldn't happen, but just use the first one we find
log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0])
}
srcFile, err := libEmbed.Open(files[0])
if err != nil {
return fmt.Errorf("read ROCm shim %s: %v", files[0], err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(files[0]))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write ROCm shim %s: %v", files[0], err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy ROCm shim %s: %v", files[0], err)
}
case err != nil:
return fmt.Errorf("stat ROCm shim %s: %v", files[0], err)
}
NoShim = false
return nil
}


@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
mkdir -p dist mkdir -p dist
for TARGETARCH in amd64 arm64; do for TARGETARCH in amd64 arm64; do
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH docker rm builder-$TARGETARCH

scripts/build_remote.py Executable file

@ -0,0 +1,68 @@
#!/usr/bin/env python3
import subprocess
import sys
from urllib.parse import urlparse
from git import Repo
# Helper script for building on remote repos by pushing local changes with git
# (e.g. particularly helpful to target a remote windows build system)
#
# Typical windows remote git config looks like this:
#
#[remote "windows-pa"]
# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
# fetch = +refs/heads/*:refs/remotes/windows-pa/*
# uploadpack = powershell git upload-pack
# receivepack = powershell git receive-pack
#
# TODO - add argparse and make this more configurable
# - force flag becomes optional
# - generate, build or test ...
# Note: remote repo will need this run once:
# git config --local receive.denyCurrentBranch updateInstead
repo = Repo(".")
# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
# GoCmd = "/usr/local/go/bin/go"
GoCmd = "go"
if repo.is_dirty():
print("Tree is dirty. Commit your changes before running this script")
sys.exit(1)
if len(sys.argv) != 2:
print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
sys.exit(1)
remote_name = sys.argv[1]
remote = {r.name: r for r in repo.remotes}[remote_name]
raw_url = list(remote.urls)[0]
url = urlparse(raw_url)
# Windows urls don't quite parse properly
if url.scheme == "" and url.netloc == "":
url = urlparse("ssh://" + raw_url)
print("URL: " + str(url))
netloc = url.netloc.split(":")[0]
path = url.path
branch_name = repo.active_branch.name
print("Force pushing content to remote...")
# Use with care given the force push
remote.push(force=True).raise_if_error()
print("Ensuring correct branch checked out on remote via ssh...")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
# TODO - add some hardening to try to figure out how to set up the path properly
# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
# TODO - or consider paramiko maybe
print("Performing generate")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...'])
print("Building")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])


@ -32,4 +32,4 @@ for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_M
curl -L -C - --header "${ACCEPT_HEADER}" \ curl -L -C - --header "${ACCEPT_HEADER}" \
-o ${OLLAMA_MODELS}/blobs/${LAYER} \ -o ${OLLAMA_MODELS}/blobs/${LAYER} \
${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER}
done done


@ -2,14 +2,17 @@ package server
import ( import (
"context" "context"
"os"
"strings" "strings"
"sync" "sync"
"testing" "testing"
"time" "time"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
) )
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server // TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
@ -33,12 +36,16 @@ var (
} }
resp = [2]string{ resp = [2]string{
"once upon a time", "once upon a time",
"fourth thursday", "united states thanksgiving",
} }
) )
func TestIntegrationSimpleOrcaMini(t *testing.T) { func TestIntegrationSimpleOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -56,7 +63,13 @@ func TestIntegrationSimpleOrcaMini(t *testing.T) {
// get true concurrency working with n_parallel support in the backend // get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
t.Skip("concurrent prediction on single runner not currently supported") t.Skip("concurrent prediction on single runner not currently supported")
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -79,6 +92,10 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
SkipIFNoTestData(t) SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel() defer cancel()
opts := api.DefaultOptions() opts := api.DefaultOptions()
@ -87,6 +104,7 @@ func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
var wg sync.WaitGroup var wg sync.WaitGroup
wg.Add(len(req)) wg.Add(len(req))
t.Logf("Running %d concurrently", len(req))
for i := 0; i < len(req); i++ { for i := 0; i < len(req); i++ {
go func(i int) { go func(i int) {
defer wg.Done() defer wg.Done()


@ -25,6 +25,7 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
"github.com/jmorganca/ollama/llm" "github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/version" "github.com/jmorganca/ollama/version"
@ -81,20 +82,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
return nil, err return nil, err
} }
ctx := c.Request.Context()
// check if the loaded model is still running in a subprocess, in case something unexpected happened
if loaded.runner != nil {
if err := loaded.runner.Ping(ctx); err != nil {
log.Print("loaded llm process not responding, closing now")
// the subprocess is no longer running, so close it
loaded.runner.Close()
loaded.runner = nil
loaded.Model = nil
loaded.Options = nil
}
}
needLoad := loaded.runner == nil || // is there a model loaded? needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed? loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@ -905,9 +892,12 @@ func Serve(ln net.Listener) error {
os.Exit(0) os.Exit(0)
}() }()
if runtime.GOOS == "linux" { if err := llm.Init(s.WorkDir); err != nil {
return fmt.Errorf("unable to initialize llm library %w", err)
}
if runtime.GOOS == "linux" { // TODO - windows too
// check compatibility to log warnings // check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil { if _, err := gpu.CheckVRAM(); err != nil {
log.Print(err.Error()) log.Print(err.Error())
} }
} }