Revamp the dynamic library shim
This switches the default llama.cpp build to be CPU-based and builds the GPU variants as dynamically loaded libraries that we can select at runtime. This also bumps the ROCm library to version 6, because 5.7 builds don't work on the latest ROCm library that just shipped.
This commit is contained in:
parent
1d1eb1688c
commit
7555ea44f8
14 changed files with 272 additions and 280 deletions
|
@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
|
||||||
ARG CUDA_VERSION=11.3.1-1
|
ARG CUDA_VERSION=11.3.1-1
|
||||||
ARG CMAKE_VERSION=3.22.1
|
ARG CMAKE_VERSION=3.22.1
|
||||||
# ROCm only supports amd64
|
# ROCm only supports amd64
|
||||||
ARG ROCM_VERSION=5.7
|
ARG ROCM_VERSION=6.0
|
||||||
|
|
||||||
# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
|
# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
|
|
|
@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
}
|
}
|
||||||
|
|
||||||
var memInfo C.mem_info_t
|
var memInfo C.mem_info_t
|
||||||
resp := GpuInfo{"", 0, 0}
|
resp := GpuInfo{"", "", 0, 0}
|
||||||
if gpuHandles.cuda != nil {
|
if gpuHandles.cuda != nil {
|
||||||
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
|
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
|
@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
} else {
|
} else {
|
||||||
resp.Driver = "CUDA"
|
resp.Driver = "CUDA"
|
||||||
|
resp.Library = "cuda_server"
|
||||||
}
|
}
|
||||||
} else if gpuHandles.rocm != nil {
|
} else if gpuHandles.rocm != nil {
|
||||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||||
|
@ -81,11 +82,14 @@ func GetGPUInfo() GpuInfo {
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
} else {
|
} else {
|
||||||
resp.Driver = "ROCM"
|
resp.Driver = "ROCM"
|
||||||
|
resp.Library = "rocm_server"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if resp.Driver == "" {
|
if resp.Driver == "" {
|
||||||
C.cpu_check_ram(&memInfo)
|
C.cpu_check_ram(&memInfo)
|
||||||
resp.Driver = "CPU"
|
resp.Driver = "CPU"
|
||||||
|
// In the future we may offer multiple CPU variants to tune CPU features
|
||||||
|
resp.Library = "default"
|
||||||
}
|
}
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
|
log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
|
||||||
|
|
|
@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
|
|
||||||
return GpuInfo{
|
return GpuInfo{
|
||||||
Driver: "METAL",
|
Driver: "METAL",
|
||||||
|
Library: "default",
|
||||||
TotalMemory: 0,
|
TotalMemory: 0,
|
||||||
FreeMemory: 0,
|
FreeMemory: 0,
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package gpu
|
||||||
// Beginning of an `ollama info` command
|
// Beginning of an `ollama info` command
|
||||||
type GpuInfo struct {
|
type GpuInfo struct {
|
||||||
Driver string `json:"driver,omitempty"`
|
Driver string `json:"driver,omitempty"`
|
||||||
|
Library string `json:"library,omitempty"`
|
||||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#include "rocm_shim.h"
|
#include "dynamic_shim.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
@ -28,8 +28,8 @@ inline static char *LOAD_ERR() {
|
||||||
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
|
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||||
ext_server_resp_t *err) {
|
ext_server_resp_t *err) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
struct lookup {
|
struct lookup {
|
||||||
char *s;
|
char *s;
|
||||||
|
@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
|
||||||
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
|
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
|
||||||
if (!s->handle) {
|
if (!s->handle) {
|
||||||
err->id = -1;
|
err->id = -1;
|
||||||
snprintf(
|
snprintf(err->msg, err->msg_len,
|
||||||
err->msg, err->msg_len,
|
"Unable to load dynamic server library: %s", LOAD_ERR());
|
||||||
"Unable to load rocm server library: %s (If you have a Radeon card, "
|
|
||||||
"did you install the ROCM libraries?)",
|
|
||||||
LOAD_ERR());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||||
ext_server_params_t *sparams,
|
ext_server_params_t *sparams,
|
||||||
ext_server_resp_t *err) {
|
ext_server_resp_t *err) {
|
||||||
s.llama_server_init(sparams, err);
|
s.llama_server_init(sparams, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
|
inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
|
||||||
s.llama_server_start();
|
s.llama_server_start();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
|
inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
|
||||||
s.llama_server_stop();
|
s.llama_server_stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||||
const char *json_req,
|
const char *json_req,
|
||||||
ext_server_resp_t *resp) {
|
ext_server_resp_t *resp) {
|
||||||
s.llama_server_completion(json_req, resp);
|
s.llama_server_completion(json_req, resp);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_completion_next_result(
|
inline void dynamic_shim_llama_server_completion_next_result(
|
||||||
struct rocm_llama_server s, const int task_id,
|
struct dynamic_llama_server s, const int task_id,
|
||||||
ext_server_task_result_t *result) {
|
ext_server_task_result_t *result) {
|
||||||
s.llama_server_completion_next_result(task_id, result);
|
s.llama_server_completion_next_result(task_id, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_completion_cancel(
|
||||||
const int task_id,
|
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
|
||||||
ext_server_resp_t *err) {
|
|
||||||
s.llama_server_completion_cancel(task_id, err);
|
s.llama_server_completion_cancel(task_id, err);
|
||||||
}
|
}
|
||||||
inline void rocm_shim_llama_server_release_task_result(
|
inline void dynamic_shim_llama_server_release_task_result(
|
||||||
struct rocm_llama_server s, ext_server_task_result_t *result) {
|
struct dynamic_llama_server s, ext_server_task_result_t *result) {
|
||||||
s.llama_server_release_task_result(result);
|
s.llama_server_release_task_result(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||||
const char *json_req,
|
const char *json_req,
|
||||||
char **json_resp,
|
char **json_resp,
|
||||||
ext_server_resp_t *err) {
|
ext_server_resp_t *err) {
|
||||||
s.llama_server_tokenize(json_req, json_resp, err);
|
s.llama_server_tokenize(json_req, json_resp, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||||
const char *json_req,
|
const char *json_req,
|
||||||
char **json_resp,
|
char **json_resp,
|
||||||
ext_server_resp_t *err) {
|
ext_server_resp_t *err) {
|
||||||
s.llama_server_detokenize(json_req, json_resp, err);
|
s.llama_server_detokenize(json_req, json_resp, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||||
const char *json_req,
|
const char *json_req,
|
||||||
char **json_resp,
|
char **json_resp,
|
||||||
ext_server_resp_t *err) {
|
ext_server_resp_t *err) {
|
||||||
s.llama_server_embedding(json_req, json_resp, err);
|
s.llama_server_embedding(json_req, json_resp, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
|
inline void dynamic_shim_llama_server_release_json_resp(
|
||||||
char **json_resp) {
|
struct dynamic_llama_server s, char **json_resp) {
|
||||||
s.llama_server_release_json_resp(json_resp);
|
s.llama_server_release_json_resp(json_resp);
|
||||||
}
|
}
|
74
llm/dynamic_shim.h
Normal file
74
llm/dynamic_shim.h
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "server.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
struct dynamic_llama_server {
|
||||||
|
void *handle;
|
||||||
|
void (*llama_server_init)(ext_server_params_t *sparams,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void (*llama_server_start)();
|
||||||
|
void (*llama_server_stop)();
|
||||||
|
void (*llama_server_completion)(const char *json_req,
|
||||||
|
ext_server_resp_t *resp);
|
||||||
|
void (*llama_server_completion_next_result)(const int task_id,
|
||||||
|
ext_server_task_result_t *result);
|
||||||
|
void (*llama_server_completion_cancel)(const int task_id,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
|
||||||
|
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void (*llama_server_embedding)(const char *json_req, char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void (*llama_server_release_json_resp)(char **json_resp);
|
||||||
|
};
|
||||||
|
|
||||||
|
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
|
||||||
|
// No good way to call C function pointers from Go so inline the indirection
|
||||||
|
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||||
|
ext_server_params_t *sparams,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||||
|
const char *json_req,
|
||||||
|
ext_server_resp_t *resp);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_completion_next_result(
|
||||||
|
struct dynamic_llama_server s, const int task_id,
|
||||||
|
ext_server_task_result_t *result);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
|
||||||
|
const int task_id,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_release_task_result(
|
||||||
|
struct dynamic_llama_server s, ext_server_task_result_t *result);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||||
|
const char *json_req, char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||||
|
const char *json_req,
|
||||||
|
char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
|
||||||
|
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||||
|
const char *json_req, char **json_resp,
|
||||||
|
ext_server_resp_t *err);
|
||||||
|
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
|
||||||
|
char **json_resp);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -17,7 +17,10 @@ package llm
|
||||||
#cgo linux CFLAGS: -D_GNU_SOURCE
|
#cgo linux CFLAGS: -D_GNU_SOURCE
|
||||||
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
||||||
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
||||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
|
||||||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
|
||||||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
|
||||||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
|
||||||
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
||||||
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
|
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
|
||||||
#cgo windows LDFLAGS: -lext_server_shared -lpthread
|
#cgo windows LDFLAGS: -lext_server_shared -lpthread
|
||||||
|
@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||||
C.llama_server_release_json_resp(json_resp)
|
C.llama_server_release_json_resp(json_resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||||
server := &llamaExtServer{opts}
|
server := &llamaExtServer{opts}
|
||||||
return newExtServer(server, model, adapters, projectors, numLayers, opts)
|
return newExtServer(server, model, adapters, projectors, numLayers, opts)
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@ init_vars() {
|
||||||
CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
|
CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
|
||||||
# TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
|
# TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
|
||||||
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
|
CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
|
||||||
if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
|
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
|
||||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
|
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
|
||||||
else
|
else
|
||||||
# TODO - add additional optimization flags...
|
# TODO - add additional optimization flags...
|
||||||
|
@ -15,7 +15,7 @@ init_vars() {
|
||||||
}
|
}
|
||||||
|
|
||||||
git_module_setup() {
|
git_module_setup() {
|
||||||
if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
|
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
|
||||||
echo "Skipping submodule initialization"
|
echo "Skipping submodule initialization"
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
|
@ -25,13 +25,13 @@ git_module_setup() {
|
||||||
}
|
}
|
||||||
|
|
||||||
apply_patches() {
|
apply_patches() {
|
||||||
if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
|
if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
|
||||||
echo "Skipping submodule patching"
|
echo "Skipping submodule patching"
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
# Workaround git apply not handling creation well for iteration
|
# Workaround git apply not handling creation well for iteration
|
||||||
rm -f gguf/examples/server/server.h
|
rm -f gguf/examples/server/server.h
|
||||||
for patch in ${PATCHES} ; do
|
for patch in ${PATCHES}; do
|
||||||
git -C gguf apply ../patches/${patch}
|
git -C gguf apply ../patches/${patch}
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,81 +1,81 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# This script is intended to run inside the go generate
|
# This script is intended to run inside the go generate
|
||||||
# working directory must be ../llm/llama.cpp
|
# working directory must be llm/llama.cpp
|
||||||
|
|
||||||
|
# First we build our default built-in library which will be linked into the CGO
|
||||||
|
# binary as a normal dependency. This default build is CPU based.
|
||||||
|
#
|
||||||
|
# Then we build a CUDA dynamic library (although statically linked with the CUDA
|
||||||
|
# library dependencies for maximum portability)
|
||||||
|
#
|
||||||
|
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly
|
||||||
|
# important to be a dynamic lib even if it's the only GPU library detected because
|
||||||
|
# we can't redistribute the object files but must rely on dynamic libraries at
|
||||||
|
# runtime, which could lead the server not to start if not present.
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
echo "Starting linux generate script"
|
echo "Starting linux generate script"
|
||||||
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
|
if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
|
||||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||||
fi
|
fi
|
||||||
|
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
|
||||||
|
OLLAMA_DYN_LIB_DIR="gguf/build/lib"
|
||||||
|
mkdir -p ${OLLAMA_DYN_LIB_DIR}
|
||||||
|
touch ${OLLAMA_DYN_LIB_DIR}/.generated
|
||||||
source $(dirname $0)/gen_common.sh
|
source $(dirname $0)/gen_common.sh
|
||||||
init_vars
|
init_vars
|
||||||
git_module_setup
|
git_module_setup
|
||||||
apply_patches
|
apply_patches
|
||||||
if [ -d /usr/local/cuda/lib64/ ] ; then
|
|
||||||
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
#
|
||||||
else
|
# CPU first for the default library
|
||||||
CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
#
|
||||||
fi
|
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||||
BUILD_DIR="gguf/build/cuda"
|
BUILD_DIR="gguf/build/cpu"
|
||||||
LIB_DIR="${BUILD_DIR}/lib"
|
|
||||||
mkdir -p ../../dist/
|
|
||||||
build
|
build
|
||||||
|
|
||||||
if [ -d /usr/local/cuda/lib64/ ] ; then
|
if [ -d /usr/local/cuda/lib64/ ]; then
|
||||||
pwd
|
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||||
ar -M <<EOF
|
init_vars
|
||||||
create ${BUILD_DIR}/libollama.a
|
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||||
addlib ${BUILD_DIR}/examples/server/libext_server.a
|
BUILD_DIR="gguf/build/cuda"
|
||||||
addlib ${BUILD_DIR}/common/libcommon.a
|
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
||||||
addlib ${BUILD_DIR}/libllama.a
|
build
|
||||||
addlib ${BUILD_DIR}/libggml_static.a
|
gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/libcuda_server.so \
|
||||||
addlib /usr/local/cuda/lib64/libcudart_static.a
|
-Wl,--whole-archive \
|
||||||
addlib /usr/local/cuda/lib64/libcublas_static.a
|
${BUILD_DIR}/examples/server/libext_server.a \
|
||||||
addlib /usr/local/cuda/lib64/libcublasLt_static.a
|
${BUILD_DIR}/common/libcommon.a \
|
||||||
addlib /usr/local/cuda/lib64/libcudadevrt.a
|
${BUILD_DIR}/libllama.a \
|
||||||
addlib /usr/local/cuda/lib64/libculibos.a
|
-Wl,--no-whole-archive \
|
||||||
save
|
${CUDA_LIB_DIR}/libcudart_static.a \
|
||||||
end
|
${CUDA_LIB_DIR}/libcublas_static.a \
|
||||||
EOF
|
${CUDA_LIB_DIR}/libcublasLt_static.a \
|
||||||
else
|
${CUDA_LIB_DIR}/libcudadevrt.a \
|
||||||
ar -M <<EOF
|
${CUDA_LIB_DIR}/libculibos.a \
|
||||||
create ${BUILD_DIR}/libollama.a
|
-lrt -lpthread -ldl -lstdc++ -lm
|
||||||
addlib ${BUILD_DIR}/examples/server/libext_server.a
|
|
||||||
addlib ${BUILD_DIR}/common/libcommon.a
|
|
||||||
addlib ${BUILD_DIR}/libllama.a
|
|
||||||
addlib ${BUILD_DIR}/libggml_static.a
|
|
||||||
save
|
|
||||||
end
|
|
||||||
EOF
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "${ROCM_PATH}" ] ; then
|
if [ -z "${ROCM_PATH}" ]; then
|
||||||
# Try the default location in case it exists
|
# Try the default location in case it exists
|
||||||
ROCM_PATH=/opt/rocm
|
ROCM_PATH=/opt/rocm
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "${CLBlast_DIR}" ] ; then
|
if [ -z "${CLBlast_DIR}" ]; then
|
||||||
# Try the default location in case it exists
|
# Try the default location in case it exists
|
||||||
if [ -d /usr/lib/cmake/CLBlast ]; then
|
if [ -d /usr/lib/cmake/CLBlast ]; then
|
||||||
export CLBlast_DIR=/usr/lib/cmake/CLBlast
|
export CLBlast_DIR=/usr/lib/cmake/CLBlast
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
BUILD_DIR="gguf/build/rocm"
|
if [ -d "${ROCM_PATH}" ]; then
|
||||||
LIB_DIR="${BUILD_DIR}/lib"
|
echo "ROCm libraries detected - building dynamic ROCm library"
|
||||||
mkdir -p ${LIB_DIR}
|
|
||||||
# Ensure we have at least one file present for the embed
|
|
||||||
touch ${LIB_DIR}/.generated
|
|
||||||
|
|
||||||
if [ -d "${ROCM_PATH}" ] ; then
|
|
||||||
echo "Building ROCm"
|
|
||||||
init_vars
|
init_vars
|
||||||
CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
|
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
|
||||||
CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
BUILD_DIR="gguf/build/rocm"
|
||||||
build
|
build
|
||||||
gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
|
gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/librocm_server.so \
|
||||||
-Wl,--whole-archive \
|
-Wl,--whole-archive \
|
||||||
${BUILD_DIR}/examples/server/libext_server.a \
|
${BUILD_DIR}/examples/server/libext_server.a \
|
||||||
${BUILD_DIR}/common/libcommon.a \
|
${BUILD_DIR}/common/libcommon.a \
|
||||||
|
|
60
llm/llama.go
60
llm/llama.go
|
@ -8,7 +8,6 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/fs"
|
"io/fs"
|
||||||
"log"
|
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
@ -120,7 +119,7 @@ type ImageData struct {
|
||||||
var (
|
var (
|
||||||
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
|
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
|
||||||
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
|
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
|
||||||
payloadMissing = fmt.Errorf("expected payload not included in this build of ollama")
|
payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
|
||||||
)
|
)
|
||||||
|
|
||||||
// StatusWriter is a writer that captures error messages from the llama runner process
|
// StatusWriter is a writer that captures error messages from the llama runner process
|
||||||
|
@ -208,41 +207,40 @@ type EmbeddingResponse struct {
|
||||||
Embedding []float64 `json:"embedding"`
|
Embedding []float64 `json:"embedding"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractLib(workDir, glob string) error {
|
func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||||
files, err := fs.Glob(libEmbed, glob)
|
files, err := fs.Glob(libEmbed, glob)
|
||||||
if err != nil || len(files) == 0 {
|
if err != nil || len(files) == 0 {
|
||||||
return payloadMissing
|
return nil, payloadMissing
|
||||||
}
|
}
|
||||||
|
libs := make([]string, len(files))
|
||||||
|
|
||||||
if len(files) != 1 {
|
for i, file := range files {
|
||||||
// Shouldn't happen, but just use the first one we find
|
srcFile, err := libEmbed.Open(file)
|
||||||
log.Printf("WARNING: multiple payloads detected - using %s", files[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
srcFile, err := libEmbed.Open(files[0])
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("read payload %s: %v", files[0], err)
|
|
||||||
}
|
|
||||||
defer srcFile.Close()
|
|
||||||
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
|
||||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
destFile := filepath.Join(workDir, filepath.Base(files[0]))
|
|
||||||
|
|
||||||
_, err = os.Stat(destFile)
|
|
||||||
switch {
|
|
||||||
case errors.Is(err, os.ErrNotExist):
|
|
||||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("write payload %s: %v", files[0], err)
|
return nil, fmt.Errorf("read payload %s: %v", file, err)
|
||||||
}
|
}
|
||||||
defer destFile.Close()
|
defer srcFile.Close()
|
||||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
||||||
return fmt.Errorf("copy payload %s: %v", files[0], err)
|
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
destFile := filepath.Join(workDir, filepath.Base(file))
|
||||||
|
libs[i] = destFile
|
||||||
|
|
||||||
|
_, err = os.Stat(destFile)
|
||||||
|
switch {
|
||||||
|
case errors.Is(err, os.ErrNotExist):
|
||||||
|
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("write payload %s: %v", file, err)
|
||||||
|
}
|
||||||
|
defer destFile.Close()
|
||||||
|
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||||
|
return nil, fmt.Errorf("copy payload %s: %v", file, err)
|
||||||
|
}
|
||||||
|
case err != nil:
|
||||||
|
return nil, fmt.Errorf("stat payload %s: %v", file, err)
|
||||||
}
|
}
|
||||||
case err != nil:
|
|
||||||
return fmt.Errorf("stat payload %s: %v", files[0], err)
|
|
||||||
}
|
}
|
||||||
return nil
|
return libs, nil
|
||||||
}
|
}
|
||||||
|
|
23
llm/llm.go
23
llm/llm.go
|
@ -22,8 +22,7 @@ type LLM interface {
|
||||||
Close()
|
Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set to false on linux/windows if we are able to load the shim
|
var AvailableShims = map[string]string{}
|
||||||
var ShimPresent = false
|
|
||||||
|
|
||||||
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||||
if _, err := os.Stat(model); err != nil {
|
if _, err := os.Stat(model); err != nil {
|
||||||
|
@ -82,15 +81,23 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||||
opts.RopeFrequencyBase = 0.0
|
opts.RopeFrequencyBase = 0.0
|
||||||
opts.RopeFrequencyScale = 0.0
|
opts.RopeFrequencyScale = 0.0
|
||||||
gpuInfo := gpu.GetGPUInfo()
|
gpuInfo := gpu.GetGPUInfo()
|
||||||
if gpuInfo.Driver == "ROCM" && ShimPresent {
|
return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
|
||||||
return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
|
|
||||||
} else {
|
|
||||||
// Rely on the built-in CUDA/Metal based server which will fall back to CPU
|
|
||||||
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Give any native cgo implementations an opportunity to initialize
|
// Give any native cgo implementations an opportunity to initialize
|
||||||
func Init(workdir string) error {
|
func Init(workdir string) error {
|
||||||
return nativeInit(workdir)
|
return nativeInit(workdir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||||
|
if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
|
||||||
|
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
|
||||||
|
if err == nil {
|
||||||
|
return srv, nil
|
||||||
|
}
|
||||||
|
log.Printf("Failed to load dynamic library - falling back to CPU mode %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -1,73 +0,0 @@
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
#include "server.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
struct rocm_llama_server {
|
|
||||||
void *handle;
|
|
||||||
void (*llama_server_init)(ext_server_params_t *sparams,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void (*llama_server_start)();
|
|
||||||
void (*llama_server_stop)();
|
|
||||||
void (*llama_server_completion)(const char *json_req,
|
|
||||||
ext_server_resp_t *resp);
|
|
||||||
void (*llama_server_completion_next_result)(const int task_id,
|
|
||||||
ext_server_task_result_t *result);
|
|
||||||
void (*llama_server_completion_cancel)(const int task_id,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
|
|
||||||
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void (*llama_server_embedding)(const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void (*llama_server_release_json_resp)(char **json_resp);
|
|
||||||
};
|
|
||||||
|
|
||||||
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
|
|
||||||
// No good way to call C function pointers from Go so inline the indirection
|
|
||||||
void rocm_shim_llama_server_init(struct rocm_llama_server s,
|
|
||||||
ext_server_params_t *sparams,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_start(struct rocm_llama_server s);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_stop(struct rocm_llama_server s);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_completion(struct rocm_llama_server s,
|
|
||||||
const char *json_req,
|
|
||||||
ext_server_resp_t *resp);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_completion_next_result(
|
|
||||||
struct rocm_llama_server s, const int task_id,
|
|
||||||
ext_server_task_result_t *result);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
|
|
||||||
const int task_id,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_release_task_result(
|
|
||||||
struct rocm_llama_server s, ext_server_task_result_t *result);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
|
|
||||||
const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
|
|
||||||
const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
|
|
||||||
void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
|
|
||||||
const char *json_req, char **json_resp,
|
|
||||||
ext_server_resp_t *err);
|
|
||||||
void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
|
|
||||||
char **json_resp);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
|
@@ -12,13 +12,13 @@ import (
|
||||||
//go:embed llama.cpp/gguf/ggml-metal.metal
|
//go:embed llama.cpp/gguf/ggml-metal.metal
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||||
// should never happen...
|
// should never happen...
|
||||||
return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
|
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
|
||||||
}
|
}
|
||||||
|
|
||||||
func nativeInit(workdir string) error {
|
func nativeInit(workdir string) error {
|
||||||
err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal")
|
_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == payloadMissing {
|
if err == payloadMissing {
|
||||||
// TODO perhaps consider this a hard failure on arm macs?
|
// TODO perhaps consider this a hard failure on arm macs?
|
||||||
|
|
|
@@ -5,7 +5,7 @@ package llm
|
||||||
/*
|
/*
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include "rocm_shim.h"
|
#include "dynamic_shim.h"
|
||||||
|
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
@@ -18,20 +18,20 @@ import (
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
"github.com/jmorganca/ollama/api"
|
"github.com/jmorganca/ollama/api"
|
||||||
)
|
)
|
||||||
|
|
||||||
//go:embed llama.cpp/gguf/build/*/lib/*
|
//go:embed llama.cpp/gguf/build/lib/*
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
|
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
|
||||||
|
|
||||||
type shimExtServer struct {
|
type shimExtServer struct {
|
||||||
s C.struct_rocm_llama_server
|
s C.struct_dynamic_llama_server
|
||||||
options api.Options
|
options api.Options
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
|
||||||
var llm *shimExtServer
|
var llm *shimExtServer
|
||||||
|
|
||||||
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_init(llm.s, sparams, err)
|
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_start() {
|
func (llm *shimExtServer) llama_server_start() {
|
||||||
C.rocm_shim_llama_server_start(llm.s)
|
C.dynamic_shim_llama_server_start(llm.s)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_stop() {
|
func (llm *shimExtServer) llama_server_stop() {
|
||||||
C.rocm_shim_llama_server_stop(llm.s)
|
C.dynamic_shim_llama_server_stop(llm.s)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
|
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
||||||
C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
|
C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
|
C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
||||||
C.rocm_shim_llama_server_release_task_result(llm.s, result)
|
C.dynamic_shim_llama_server_release_task_result(llm.s, result)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
|
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
|
C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||||
C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
|
C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
|
||||||
}
|
}
|
||||||
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||||
C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
|
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
|
||||||
if !ShimPresent {
|
shimMutex.Lock()
|
||||||
return nil, RocmShimMissing
|
defer shimMutex.Unlock()
|
||||||
|
libPath := C.CString(library)
|
||||||
|
defer C.free(unsafe.Pointer(libPath))
|
||||||
|
resp := newExtServerResp(128)
|
||||||
|
defer freeExtServerResp(resp)
|
||||||
|
var srv C.struct_dynamic_llama_server
|
||||||
|
C.dynamic_shim_init(libPath, &srv, &resp)
|
||||||
|
if resp.id < 0 {
|
||||||
|
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
|
||||||
}
|
}
|
||||||
log.Printf("Loading ROCM llm server")
|
llm = &shimExtServer{
|
||||||
if llm == nil {
|
s: srv,
|
||||||
return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
|
options: opts,
|
||||||
}
|
}
|
||||||
llm.options = opts
|
log.Printf("Loading Dynamic Shim llm server: %s", library)
|
||||||
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
|
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func nativeInit(workdir string) error {
|
func nativeInit(workdir string) error {
|
||||||
err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
|
libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == payloadMissing {
|
if err == payloadMissing {
|
||||||
log.Printf("%s", RocmShimMissing)
|
log.Printf("%s", payloadMissing)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
} else {
|
}
|
||||||
ShimPresent = true
|
for _, lib := range libs {
|
||||||
|
libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
|
||||||
|
AvailableShims[libName] = lib
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
// Only check ROCm access if we have the dynamic lib loaded
|
||||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
|
||||||
if err != nil {
|
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||||
if errors.Is(err, fs.ErrPermission) {
|
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
||||||
log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
if err != nil {
|
||||||
return err
|
if errors.Is(err, fs.ErrPermission) {
|
||||||
} else if errors.Is(err, fs.ErrNotExist) {
|
log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
||||||
// expected behavior without a radeon card
|
return err
|
||||||
return nil
|
} else if errors.Is(err, fs.ErrNotExist) {
|
||||||
|
// expected behavior without a radeon card
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||||
}
|
}
|
||||||
|
fd.Close()
|
||||||
|
|
||||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
|
||||||
}
|
}
|
||||||
fd.Close()
|
|
||||||
|
|
||||||
shimMutex.Lock()
|
|
||||||
defer shimMutex.Unlock()
|
|
||||||
if llm != nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
var libName string
|
|
||||||
switch runtime.GOOS {
|
|
||||||
case "darwin":
|
|
||||||
// shouldn't happen
|
|
||||||
return nil
|
|
||||||
case "linux":
|
|
||||||
libName = "librocm_server.so"
|
|
||||||
case "windows":
|
|
||||||
libName = "rocm_server.dll"
|
|
||||||
default:
|
|
||||||
// shouldn't happen
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
libPath := C.CString(filepath.Join(workdir, libName))
|
|
||||||
defer C.free(unsafe.Pointer(libPath))
|
|
||||||
resp := newExtServerResp(128)
|
|
||||||
defer freeExtServerResp(resp)
|
|
||||||
var srv C.struct_rocm_llama_server
|
|
||||||
C.rocm_shim_init(libPath, &srv, &resp)
|
|
||||||
if resp.id < 0 {
|
|
||||||
// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
|
|
||||||
// and run against CPU
|
|
||||||
return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
|
|
||||||
}
|
|
||||||
llm = &shimExtServer{
|
|
||||||
s: srv,
|
|
||||||
options: api.DefaultOptions(),
|
|
||||||
}
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue