diff --git a/Dockerfile.build b/Dockerfile.build index ca6b1a29..96b06138 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,101 +1,93 @@ ARG GOLANG_VERSION=1.21.3 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION=11.3.1 -ARG ROCM_VERSION=5.7.1 + +# Copy the minimal context we need to run the generate scripts +FROM scratch AS llm-code +COPY .git .git +COPY .gitmodules .gitmodules +COPY llm llm FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 - ARG CMAKE_VERSION - -RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl \ - && yum update -y \ - && yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH - -ADD https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-x86_64.tar.gz /tmp/cmake-$CMAKE_VERSION.tar.gz -RUN tar -zx -C /usr --strip-components 1 diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c +#define ROCM_LOOKUP_SIZE 5 + void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { rsmi_status_t ret; resp->err = NULL; @@ -13,11 +15,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { struct lookup { char *s; void **p; - } l[4] = { + } l[ROCM_LOOKUP_SIZE] = { {"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, + {"rsmi_version_get", (void *)&resp->rh.versionGetFn}, // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, }; @@ -32,7 +35,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { return; } - for (i = 0; i < 4; i++) { + for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); if (!l[i].p) { UNLOAD_LIBRARY(resp->rh.handle); @@ -103,4 +106,25 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { return; } +void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { + const int buflen = 256; + char buf[buflen + 1]; + if (h.handle == NULL) { + resp->str = strdup("rocm handle not initialized"); + resp->status = 1; + return; + } + rsmi_version_t ver; + rsmi_status_t ret; + ret = h.versionGetFn(&ver); + if (ret != RSMI_STATUS_SUCCESS) { + snprintf(buf, buflen, "unexpected response on version lookup %d", ret); + resp->status = 1; + } else { + snprintf(buf, buflen, "%d", ver.major); + resp->status = 0; + } + resp->str = strdup(buf); +} + #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h index 1f74713b..90d9a09f 100644 --- a/gpu/gpu_info_rocm.h +++ b/gpu/gpu_info_rocm.h @@ -15,12 +15,20 @@ typedef enum rsmi_memory_type { RSMI_MEM_TYPE_GTT, } rsmi_memory_type_t; + typedef struct { + uint32_t major; + uint32_t minor; + uint32_t patch; + const char *build; + } rsmi_version_t; + typedef struct rocm_handle { void *handle; rsmi_status_t (*initFn)(uint64_t); rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*versionGetFn)(rsmi_version_t *version); // rsmi_status_t (*getHandle)(uint32_t, uint16_t *); } rocm_handle_t; @@ -29,8 +37,14 @@ typedef struct rocm_init_resp { rocm_handle_t rh; } rocm_init_resp_t; +typedef struct rocm_version_resp { + rsmi_status_t status; + char *str; // Contains version or error string if status != 0 +} rocm_version_resp_t; + void rocm_init(char
*rocm_lib_path, rocm_init_resp_t *resp); void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); +void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp); #endif // __GPU_INFO_ROCM_H__ #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go index b65bce81..f57597b5 100644 --- a/gpu/gpu_test.go +++ b/gpu/gpu_test.go @@ -9,7 +9,7 @@ import ( func TestBasicGetGPUInfo(t *testing.T) { info := GetGPUInfo() - assert.Contains(t, "cuda rocm cpu default", info.Library) + assert.Contains(t, "cuda rocm cpu metal", info.Library) switch runtime.GOOS { case "darwin": diff --git a/gpu/types.go b/gpu/types.go index abc16dbc..24fa4a24 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -11,5 +11,8 @@ type GpuInfo struct { memInfo Library string `json:"library,omitempty"` + // Optional variant to select (e.g. versions, cpu feature flags) + Variant string `json:"variant,omitempty"` + // TODO add other useful attributes about the card here for discovery information } diff --git a/llm/dynamic_shim.c b/llm/dyn_ext_server.c similarity index 82% rename from llm/dynamic_shim.c rename to llm/dyn_ext_server.c index c3e74d4a..111e4ab5 100644 --- a/llm/dynamic_shim.c +++ b/llm/dyn_ext_server.c @@ -1,4 +1,4 @@ -#include "dynamic_shim.h" +#include "dyn_ext_server.h" #include <stdio.h> #include <string.h> @@ -33,7 +33,7 @@ inline char *LOAD_ERR() { #define UNLOAD_LIBRARY(handle) dlclose(handle) #endif -void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, +void dyn_init(const char *libPath, struct dynamic_llama_server *s, ext_server_resp_t *err) { int i = 0; struct lookup { @@ -58,7 +58,7 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, {"", NULL}, }; - printf("Lazy loading %s library\n", libPath); + printf("loading %s library\n", libPath); s->handle = LOAD_LIBRARY(libPath, RTLD_NOW); if (!s->handle) { err->id = -1; @@ -83,63 +83,63 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, } } -inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s, +inline void dyn_llama_server_init(struct dynamic_llama_server s, ext_server_params_t *sparams, ext_server_resp_t *err) { s.llama_server_init(sparams, err); } -inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) { +inline void dyn_llama_server_start(struct dynamic_llama_server s) { s.llama_server_start(); } -inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) { +inline void dyn_llama_server_stop(struct dynamic_llama_server s) { s.llama_server_stop(); } -inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s, +inline void dyn_llama_server_completion(struct dynamic_llama_server s, const char *json_req, ext_server_resp_t *resp) { s.llama_server_completion(json_req, resp); } -inline void dynamic_shim_llama_server_completion_next_result( +inline void dyn_llama_server_completion_next_result( struct dynamic_llama_server s, const int task_id, ext_server_task_result_t *result) { s.llama_server_completion_next_result(task_id, result); } -inline void dynamic_shim_llama_server_completion_cancel( +inline void dyn_llama_server_completion_cancel( struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { s.llama_server_completion_cancel(task_id, err); } -inline void dynamic_shim_llama_server_release_task_result( +inline void dyn_llama_server_release_task_result( struct dynamic_llama_server s, ext_server_task_result_t *result) { s.llama_server_release_task_result(result); } -inline void
dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s, +inline void dyn_llama_server_tokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_tokenize(json_req, json_resp, err); } -inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s, +inline void dyn_llama_server_detokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_detokenize(json_req, json_resp, err); } -inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s, +inline void dyn_llama_server_embedding(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err) { s.llama_server_embedding(json_req, json_resp, err); } -inline void dynamic_shim_llama_server_release_json_resp( +inline void dyn_llama_server_release_json_resp( struct dynamic_llama_server s, char **json_resp) { s.llama_server_release_json_resp(json_resp); } diff --git a/llm/ext_server_common.go b/llm/dyn_ext_server.go similarity index 72% rename from llm/ext_server_common.go rename to llm/dyn_ext_server.go index 9a331742..105df634 100644 --- a/llm/ext_server_common.go +++ b/llm/dyn_ext_server.go @@ -10,31 +10,25 @@ package llm #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin LDFLAGS: -lc++ -framework Accelerate #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a #cgo linux CFLAGS: -D_GNU_SOURCE -#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS -#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm #cgo linux windows LDFLAGS: -lpthread #include <stdlib.h> -#include "ext_server.h" +#include "dyn_ext_server.h" */ import "C" + import ( "bytes" "context" "encoding/json" "fmt" "log" + "os" + "path/filepath" + "runtime" "strings" "sync" "time" @@ -43,19 +37,9 @@ import ( "github.com/jmorganca/ollama/api" ) -type extServer interface { - LLM - llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) - llama_server_start() - llama_server_stop() - llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) - llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) - llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) - llama_server_release_task_result(result *C.ext_server_task_result_t) - llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) - llama_server_release_json_resp(json_resp **C.char) +type dynExtServer
struct { + s C.struct_dynamic_llama_server + options api.Options } // Note: current implementation does not support concurrent instantiations @@ -80,11 +64,30 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error { return fmt.Errorf(C.GoString(resp.msg)) } -func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) { +// Note: current implementation does not support concurrent instantiations +var llm *dynExtServer + +func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) { if !mutex.TryLock() { log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } + updatePath(filepath.Dir(library)) + libPath := C.CString(library) + defer C.free(unsafe.Pointer(libPath)) + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var srv C.struct_dynamic_llama_server + C.dyn_init(libPath, &srv, &resp) + if resp.id < 0 { + mutex.Unlock() + return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) + } + llm = &dynExtServer{ + s: srv, + options: opts, + } + log.Printf("Loading dynamic LLM server: %s", library) var sparams C.ext_server_params_t sparams.model = C.CString(model) @@ -133,20 +136,20 @@ func newExtServer(server extServer, model string, adapters, projectors []string, sparams.n_threads = C.uint(opts.NumThread) - log.Printf("Initializing internal llama server") - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - server.llama_server_init(&sparams, &resp) - if resp.id < 0 { - return nil, extServerResponseToErr(resp) + log.Printf("Initializing llama server") + initResp := newExtServerResp(128) + defer freeExtServerResp(initResp) + C.dyn_llama_server_init(llm.s, &sparams, &initResp) + if initResp.id < 0 { + mutex.Unlock() + return nil, extServerResponseToErr(initResp) } - log.Printf("Starting internal llama main loop") - server.llama_server_start() - return server, nil + log.Printf("Starting llama main loop") + C.dyn_llama_server_start(llm.s) + return llm, nil } -func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error { +func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { resp := newExtServerResp(128) defer freeExtServerResp(resp) var imageData []ImageData @@ -204,7 +207,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr req := C.CString(buffer.String()) defer C.free(unsafe.Pointer(req)) - llm.llama_server_completion(req, &resp) + C.dyn_llama_server_completion(llm.s, req, &resp) if resp.id < 0 { return extServerResponseToErr(resp) } @@ -215,7 +218,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr select { case <-ctx.Done(): // This handles the request cancellation - llm.llama_server_completion_cancel(resp.id, &resp) + C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) if resp.id < 0 { return extServerResponseToErr(resp) } else { @@ -223,13 +226,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr default: var result C.ext_server_task_result_t - llm.llama_server_completion_next_result(resp.id, &result) + C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result) json_resp := C.GoString(result.json_resp) - llm.llama_server_release_task_result(&result) + C.dyn_llama_server_release_task_result(llm.s, &result) var p prediction if err := json.Unmarshal([]byte(json_resp), &p); err != nil { - 
llm.llama_server_completion_cancel(resp.id, &resp) + C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) if resp.id < 0 { return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) } else { @@ -270,7 +273,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr return fmt.Errorf("max retries exceeded") } -func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { +func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { data, err := json.Marshal(TokenizeRequest{Content: prompt}) if err != nil { return nil, fmt.Errorf("marshaling encode data: %w", err) @@ -280,11 +283,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_tokenize(req, &json_resp, &resp) + C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp) if resp.id < 0 { return nil, extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var encoded TokenizeResponse if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { @@ -294,7 +297,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { return encoded.Tokens, err } -func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { +func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) { if len(tokens) == 0 { return "", nil } @@ -308,11 +311,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_detokenize(req, &json_resp, &resp) + C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp) if resp.id < 0 { return "", extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var decoded DetokenizeResponse if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { @@ -322,7 +325,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { return decoded.Content, err } -func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) { +func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { data, err := json.Marshal(TokenizeRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) @@ -333,11 +336,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err var json_resp *C.char resp := newExtServerResp(128) defer freeExtServerResp(resp) - llm.llama_server_embedding(req, &json_resp, &resp) + C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp) if resp.id < 0 { return nil, extServerResponseToErr(resp) } - defer llm.llama_server_release_json_resp(&json_resp) + defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) var embedding EmbeddingResponse if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { @@ -347,7 +350,38 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err return embedding.Embedding, nil } -func close(llm extServer) { - llm.llama_server_stop() +func (llm *dynExtServer) Close() { + C.dyn_llama_server_stop(llm.s) mutex.Unlock() } + +func updatePath(dir 
string) { + if runtime.GOOS == "windows" { + tmpDir := filepath.Dir(dir) + pathComponents := strings.Split(os.Getenv("PATH"), ";") + i := 0 + for _, comp := range pathComponents { + if strings.EqualFold(comp, dir) { + return + } + // Remove any other prior paths to our temp dir + if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { + pathComponents[i] = comp + i++ + } + } + // Only join the entries we kept above + newPath := strings.Join(append([]string{dir}, pathComponents[:i]...), ";") + log.Printf("Updating PATH to %s", newPath) + os.Setenv("PATH", newPath) + } else { + pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") + for _, comp := range pathComponents { + if comp == dir { + return + } + } + newPath := strings.Join(append([]string{dir}, pathComponents...), ":") + log.Printf("Updating LD_LIBRARY_PATH to %s", newPath) + os.Setenv("LD_LIBRARY_PATH", newPath) + } +} diff --git a/llm/dynamic_shim.h b/llm/dyn_ext_server.h similarity index 75% rename from llm/dynamic_shim.h rename to llm/dyn_ext_server.h index 116ca722..cddf4a1f 100644 --- a/llm/dynamic_shim.h +++ b/llm/dyn_ext_server.h @@ -27,46 +27,46 @@ struct dynamic_llama_server { void (*llama_server_release_json_resp)(char **json_resp); }; -void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, +void dyn_init(const char *libPath, struct dynamic_llama_server *s, ext_server_resp_t *err); // No good way to call C function pointers from Go so inline the indirection -void dynamic_shim_llama_server_init(struct dynamic_llama_server s, +void dyn_llama_server_init(struct dynamic_llama_server s, ext_server_params_t *sparams, ext_server_resp_t *err); -void dynamic_shim_llama_server_start(struct dynamic_llama_server s); +void dyn_llama_server_start(struct dynamic_llama_server s); -void dynamic_shim_llama_server_stop(struct dynamic_llama_server s); +void dyn_llama_server_stop(struct dynamic_llama_server s); -void dynamic_shim_llama_server_completion(struct dynamic_llama_server s, +void dyn_llama_server_completion(struct dynamic_llama_server s, const char *json_req, ext_server_resp_t *resp); -void dynamic_shim_llama_server_completion_next_result( +void dyn_llama_server_completion_next_result( struct dynamic_llama_server s, const int task_id, ext_server_task_result_t *result); -void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s, +void dyn_llama_server_completion_cancel(struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err); -void dynamic_shim_llama_server_release_task_result( +void dyn_llama_server_release_task_result( struct dynamic_llama_server s, ext_server_task_result_t *result); -void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s, +void dyn_llama_server_tokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err); -void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s, +void dyn_llama_server_detokenize(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err); -void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s, +void dyn_llama_server_embedding(struct dynamic_llama_server s, const char *json_req, char **json_resp, ext_server_resp_t *err); -void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s, +void dyn_llama_server_release_json_resp(struct dynamic_llama_server s, char **json_resp); #ifdef __cplusplus diff --git a/llm/ext_server/README.md b/llm/ext_server/README.md index ac58d9c8..bfb0d4a6 100644 --- a/llm/ext_server/README.md
+++ b/llm/ext_server/README.md @@ -1,4 +1,18 @@ # Extern C Server -This directory contains a thin facade we layer on top of the Llama.cpp server -to expose `extern C` interfaces to access the functionality through direct API calls in-process +This directory contains a thin facade we layer on top of the Llama.cpp server to +expose `extern C` interfaces for accessing its functionality through direct, +in-process API calls. The llama.cpp code uses compile-time macros to configure the +GPU type along with other settings. During the `go generate ./...` execution, the +build generates one or more copies of the llama.cpp `extern C` server based on +which GPU libraries are detected, supporting multiple GPU types as well as +CPU-only operation. The Ollama Go build then embeds these different servers to +support different GPUs and settings at runtime. + +If you are making changes to the code in this directory, make sure to disable +caching during your Go build so you pick up your changes. A typical iteration +cycle from the top of the source tree looks like: + +``` +go generate ./... && go build -a . +``` \ No newline at end of file diff --git a/llm/ext_server_default.go b/llm/ext_server_default.go deleted file mode 100644 index 05287383..00000000 --- a/llm/ext_server_default.go +++ /dev/null @@ -1,80 +0,0 @@ -//go:build !windows - -package llm - -/* -#include <stdlib.h> -#include "ext_server.h" - -*/ -import "C" -import ( - "context" - - "github.com/jmorganca/ollama/api" -) - -type llamaExtServer struct { - api.Options -} - -func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { - C.llama_server_init(sparams, err) -} -func (llm *llamaExtServer) llama_server_start() { - C.llama_server_start() -} -func (llm *llamaExtServer) llama_server_stop() { - C.llama_server_stop() -} - -func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { - C.llama_server_completion(json_req, resp) -} -func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { - C.llama_server_completion_next_result(task_id, resp) -} -func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { - C.llama_server_completion_cancel(task_id, err) -} -func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { - C.llama_server_release_task_result(result) -} - -func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_tokenize(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_detokenize(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.llama_server_embedding(json_req, json_resp, err) -} -func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { - C.llama_server_release_json_resp(json_resp) -} - -func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) { - server := &llamaExtServer{opts} - return newExtServer(server, model, adapters, projectors, opts) -} - -func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { - return predict(ctx, llm, pred, fn) -} - -func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
- return encode(llm, ctx, prompt) -} - -func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { - return decode(llm, ctx, tokens) -} - -func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { - return embedding(llm, ctx, input) -} - -func (llm *llamaExtServer) Close() { - close(llm) -} diff --git a/llm/ext_server_windows.go b/llm/ext_server_windows.go deleted file mode 100644 index 39b5f096..00000000 --- a/llm/ext_server_windows.go +++ /dev/null @@ -1,12 +0,0 @@ -package llm - -import ( - "github.com/jmorganca/ollama/api" -) - -func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) { - // On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies - // This ensures we can update the PATH at runtime to get everything loaded - - return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts) -} diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index ac91f1aa..d7bafa5b 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -51,6 +51,16 @@ install() { cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib } +link_server_lib() { + gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ + -Wl,--whole-archive \ + ${BUILD_DIR}/lib/libext_server.a \ + -Wl,--no-whole-archive \ + ${BUILD_DIR}/lib/libcommon.a \ + ${BUILD_DIR}/lib/libllama.a +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index cabd8f75..b7f1f684 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -29,4 +29,16 @@ git_module_setup apply_patches build install +gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ + -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \ + ${BUILD_DIR}/lib/libcommon.a \ + ${BUILD_DIR}/lib/libllama.a \ + ${BUILD_DIR}/lib/libggml_static.a \ + -lpthread -ldl -lm -lc++ \ + -framework Accelerate \ + -framework Foundation \ + -framework Metal \ + -framework MetalKit \ + -framework MetalPerformanceShaders + cleanup diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 52081156..0c940ba5 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -48,23 +48,76 @@ init_vars git_module_setup apply_patches -# -# CPU first for the default library -# -CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" -BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" +if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then + # Users building from source can tune the exact flags we pass to cmake for configuring + # llama.cpp, and we'll build only one CPU variant in that case, as the default.
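+ # For example (illustrative values only; any llama.cpp cmake flags can be used): + #   OLLAMA_CUSTOM_CPU_DEFS="-DLLAMA_AVX512=on -DLLAMA_AVX512_VBMI=on -DLLAMA_FMA=on" go generate ./...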
+ if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then + echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + echo "Building custom CPU" + build + install + link_server_lib + else + # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 + # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer + # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) + # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen + # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver + # Note: the following seem to yield slower results than AVX2 - ymmv + # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT) + # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake + # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake -build -install + COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" + # + # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) + # + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + echo "Building LCD CPU" + build + install + link_server_lib -# Placeholder to keep go embed happy until we start building dynamic CPU lib variants -touch ${BUILD_DIR}/lib/dummy.so + # + # ~2011 CPU dynamic library with more capabilities turned on to optimize performance + # Approximately 400% faster than LCD on the same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx" + echo "Building AVX CPU" + build + install + link_server_lib + + # + # ~2013 CPU dynamic library + # Approximately 10% faster than AVX on the same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2" + echo "Building AVX2 CPU" + build + install + link_server_lib + fi +else + echo "Skipping CPU generation step as requested" +fi if [ -d /usr/local/cuda/lib64/ ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars + CUDA_MAJOR=$(ls /usr/local/cuda/lib64/libcudart.so.* | head -1 | cut -f3 -d. || true) + if [ -n "${CUDA_MAJOR}" ]; then + CUDA_VARIANT=_v${CUDA_MAJOR} + fi CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}" CUDA_LIB_DIR=/usr/local/cuda/lib64 build install @@ -96,9 +149,12 @@ fi if [ -d "${ROCM_PATH}" ]; then echo "ROCm libraries detected - building dynamic ROCm library" + if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then + ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d.
|| true) + fi init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}" build install gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 9435fffa..1bc08c69 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -4,7 +4,7 @@ $ErrorActionPreference = "Stop" function init_vars { $script:llamacppDir = "../llama.cpp" - $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64") + $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64") $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static") if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on") @@ -63,16 +63,36 @@ init_vars git_module_setup apply_patches -# first build CPU based -$script:buildDir="${script:llamacppDir}/build/windows/cpu" +# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer +# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) +# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen +# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver +$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off") + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu" +write-host "Building LCD CPU" +build +install + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx" +write-host "Building AVX CPU" +build +install + +$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs +$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2" +write-host "Building AVX2 CPU" build install # Then build cuda as a dynamically loaded library +# TODO figure out how to detect cuda version init_vars $script:buildDir="${script:llamacppDir}/build/windows/cuda" -$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON") +$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on") build install diff --git a/llm/llm.go b/llm/llm.go index 4f709538..d7667675 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -18,8 +18,6 @@ type LLM interface { Close() } -var AvailableShims = map[string]string{} - func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err @@ -112,7 +110,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlmServer(library, model, adapters, projectors, opts) + gpuInfo := gpu.GetGPUInfo() + return newLlmServer(gpuInfo, model, adapters, projectors, opts) } // Give any 
native cgo implementations an opportunity to initialize @@ -120,15 +119,30 @@ func Init(workdir string) error { return nativeInit(workdir) } -func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - if _, libPresent := AvailableShims[library]; libPresent && library != "default" { - srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts) +func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) { + dynLibs := getDynLibs(gpuInfo) + + // Check to see if the user has requested a specific library instead of auto-detecting + demandLib := os.Getenv("OLLAMA_LLM_LIBRARY") + if demandLib != "" { + libPath := availableDynLibs[demandLib] + if libPath == "" { + log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib) + } else { + log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib) + dynLibs = []string{libPath} + } + } + + err2 := fmt.Errorf("unable to locate suitable llm library") + for _, dynLib := range dynLibs { + srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts) if err == nil { return srv, nil } - log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err) - // TODO - update some state to indicate we were unable to load the GPU library for future "info" ux + log.Printf("Failed to load dynamic library %s %s", dynLib, err) + err2 = err } - return newDefaultExtServer(model, adapters, projectors, opts) + return nil, err2 } diff --git a/llm/payload_common.go b/llm/payload_common.go new file mode 100644 index 00000000..f6976768 --- /dev/null +++ b/llm/payload_common.go @@ -0,0 +1,244 @@ +package llm + +import ( + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "runtime" + "slices" + "strings" + + "github.com/jmorganca/ollama/gpu" +) + +// Library names may contain an optional variant separated by '_' +// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2" +// Any library without a variant is the lowest common denominator +var availableDynLibs = map[string]string{} + +const pathComponentCount = 6 + +// getDynLibs returns an ordered list of LLM libraries to try, starting with the best +func getDynLibs(gpuInfo gpu.GpuInfo) []string { + // Short circuit if we know we're using the default built-in (darwin only) + if gpuInfo.Library == "default" { + return []string{"default"} + } + + exactMatch := "" + dynLibs := []string{} + altDynLibs := []string{} + requested := gpuInfo.Library + if gpuInfo.Variant != "" { + requested += "_" + gpuInfo.Variant + } + // Try to find an exact match + for cmp := range availableDynLibs { + if requested == cmp { + exactMatch = cmp + dynLibs = []string{availableDynLibs[cmp]} + break + } + } + // Then for GPUs load alternates and sort the list for consistent load ordering + if gpuInfo.Library != "cpu" { + for cmp := range availableDynLibs { + if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch { + altDynLibs = append(altDynLibs, cmp) + } + } + slices.Sort(altDynLibs) + for _, altDynLib := range altDynLibs { + dynLibs = append(dynLibs, availableDynLibs[altDynLib]) + } + } + + // Load up the best CPU variant as a fallback if CPU wasn't the primary library requested + if gpuInfo.Library != "cpu" { + variant := gpu.GetCPUVariant() + // If no variant, then we fall back to default + // If we have a variant, try that if we find an exact match + // Attempting to run the wrong CPU instructions will panic the + // process + if variant != "" { + for cmp :=
range availableDynLibs { + if cmp == "cpu_"+variant { + dynLibs = append(dynLibs, availableDynLibs[cmp]) + break + } + } + } else { + dynLibs = append(dynLibs, availableDynLibs["cpu"]) + } + } + + // Finally, if we didn't find any matches, LCD CPU FTW + if len(dynLibs) == 0 { + dynLibs = []string{availableDynLibs["cpu"]} + } + return dynLibs +} + +func rocmDynLibPresent() bool { + for dynLibName := range availableDynLibs { + if strings.HasPrefix(dynLibName, "rocm") { + return true + } + } + return false +} + +func nativeInit(workdir string) error { + if runtime.GOOS == "darwin" { + err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") + if err != nil { + if err == payloadMissing { + // TODO perhaps consider this a hard failure on arm macs? + log.Printf("ggml-metal.metal payload missing") + return nil + } + return err + } + os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) + } + + libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") + if err != nil { + if err == payloadMissing { + log.Printf("%s", payloadMissing) + return nil + } + return err + } + for _, lib := range libs { + // The last dir component is the variant name + variant := filepath.Base(filepath.Dir(lib)) + availableDynLibs[variant] = lib + } + + if err := verifyDriverAccess(); err != nil { + return err + } + + // Report which dynamic libraries we have loaded to assist troubleshooting + variants := make([]string, len(availableDynLibs)) + i := 0 + for variant := range availableDynLibs { + variants[i] = variant + i++ + } + log.Printf("Dynamic LLM libraries %v", variants) + log.Printf("Override detection logic by setting OLLAMA_LLM_LIBRARY") + + return nil +} + +func extractDynamicLibs(workDir, glob string) ([]string, error) { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return nil, payloadMissing + } + libs := []string{} + + for _, file := range files { + pathComps := strings.Split(file, "/") + if len(pathComps) != pathComponentCount { + log.Printf("unexpected payload components: %v", pathComps) + continue + } + // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY + // Include the variant in the path to avoid conflicts between multiple server libs + targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) + srcFile, err := libEmbed.Open(file) + if err != nil { + return nil, fmt.Errorf("read payload %s: %v", file, err) + } + defer srcFile.Close() + if err := os.MkdirAll(targetDir, 0o755); err != nil { + return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err) + } + + destFile := filepath.Join(targetDir, filepath.Base(file)) + if strings.Contains(destFile, "server") { + libs = append(libs, destFile) + } + + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return nil, fmt.Errorf("write payload %s: %v", file, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, srcFile); err != nil { + return nil, fmt.Errorf("copy payload %s: %v", file, err) + } + case err != nil: + return nil, fmt.Errorf("stat payload %s: %v", file, err) + } + } + return libs, nil +} + +func extractPayloadFiles(workDir, glob string) error { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return payloadMissing + } + + for _, file := range files { + srcFile, err := libEmbed.Open(file) + if err != nil { + return fmt.Errorf("read payload %s: %v", file, err) + } + defer srcFile.Close() + if err := os.MkdirAll(workDir,
0o755); err != nil { + return fmt.Errorf("create payload temp dir %s: %v", workDir, err) + } + + destFile := filepath.Join(workDir, filepath.Base(file)) + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", file, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, srcFile); err != nil { + return fmt.Errorf("copy payload %s: %v", file, err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", file, err) + } + } + return nil +} + +func verifyDriverAccess() error { + if runtime.GOOS != "linux" { + return nil + } + // Only check ROCm access if we have the dynamic lib loaded + if rocmDynLibPresent() { + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add your user account to the render group.") + } else if errors.Is(err, fs.ErrNotExist) { + // expected behavior without a radeon card + return nil + } + + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) + } + fd.Close() + } + return nil +} diff --git a/llm/payload_darwin.go b/llm/payload_darwin.go new file mode 100644 index 00000000..1a5f042a --- /dev/null +++ b/llm/payload_darwin.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so +var libEmbed embed.FS diff --git a/llm/payload_linux.go b/llm/payload_linux.go new file mode 100644 index 00000000..afef040a --- /dev/null +++ b/llm/payload_linux.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/build/linux/*/lib/*.so +var libEmbed embed.FS diff --git a/llm/payload_test.go b/llm/payload_test.go new file mode 100644 index 00000000..7a644713 --- /dev/null +++ b/llm/payload_test.go @@ -0,0 +1,54 @@ +package llm + +import ( + "testing" + + "github.com/jmorganca/ollama/gpu" + "github.com/stretchr/testify/assert" +) + +func TestGetDynLibs(t *testing.T) { + availableDynLibs = map[string]string{ + "cpu": "X_cpu", + } + assert.Equal(t, false, rocmDynLibPresent()) + res := getDynLibs(gpu.GpuInfo{Library: "cpu"}) + assert.Len(t, res, 1) + assert.Equal(t, availableDynLibs["cpu"], res[0]) + + availableDynLibs = map[string]string{ + "rocm_v5": "X_rocm_v5", + "rocm_v6": "X_rocm_v6", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmDynLibPresent()) + res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) + assert.Len(t, res, 3) + assert.Equal(t, availableDynLibs["rocm_v5"], res[0]) + assert.Equal(t, availableDynLibs["rocm_v6"], res[1]) + assert.Equal(t, availableDynLibs["cpu"], res[2]) + + res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) + assert.Len(t, res, 3) + assert.Equal(t, availableDynLibs["rocm_v6"], res[0]) + assert.Equal(t, availableDynLibs["rocm_v5"], res[1]) + assert.Equal(t, availableDynLibs["cpu"], res[2]) + + res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) + assert.Len(t, res, 1) + assert.Equal(t, availableDynLibs["cpu"], res[0]) + + res = getDynLibs(gpu.GpuInfo{Library: "default"}) + assert.Len(t, res, 1) + assert.Equal(t, "default", res[0]) + + availableDynLibs = map[string]string{ + "rocm": "X_rocm_v5", + "cpu": "X_cpu", + } + assert.Equal(t, true, rocmDynLibPresent()) + res = getDynLibs(gpu.GpuInfo{Library:
"rocm", Variant: "v6"}) + assert.Len(t, res, 2) + assert.Equal(t, availableDynLibs["rocm"], res[0]) + assert.Equal(t, availableDynLibs["cpu"], res[1]) +} diff --git a/llm/payload_windows.go b/llm/payload_windows.go new file mode 100644 index 00000000..21c6cc4d --- /dev/null +++ b/llm/payload_windows.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/build/windows/*/lib/*.dll +var libEmbed embed.FS diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go deleted file mode 100644 index 3baafd1e..00000000 --- a/llm/shim_darwin.go +++ /dev/null @@ -1,71 +0,0 @@ -package llm - -import ( - "embed" - "errors" - "fmt" - "io" - "io/fs" - "log" - "os" - "path/filepath" - - "github.com/jmorganca/ollama/api" -) - -//go:embed llama.cpp/ggml-metal.metal -var libEmbed embed.FS - -func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - // should never happen... - return nil, fmt.Errorf("Dynamic library loading not supported on Mac") -} - -func nativeInit(workdir string) error { - err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") - if err != nil { - if err == payloadMissing { - // TODO perhaps consider this a hard failure on arm macs? - log.Printf("ggml-meta.metal payload missing") - return nil - } - return err - } - os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) - return nil -} - -func extractPayloadFiles(workDir, glob string) error { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return payloadMissing - } - - for _, file := range files { - srcFile, err := libEmbed.Open(file) - if err != nil { - return fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(workDir, 0o755); err != nil { - return fmt.Errorf("create payload temp dir %s: %v", workDir, err) - } - - destFile := filepath.Join(workDir, filepath.Base(file)) - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", file, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return fmt.Errorf("copy payload %s: %v", file, err) - } - case err != nil: - return fmt.Errorf("stat payload %s: %v", file, err) - } - } - return nil -} diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go deleted file mode 100644 index dca7b38d..00000000 --- a/llm/shim_ext_server.go +++ /dev/null @@ -1,193 +0,0 @@ -//go:build !darwin - -package llm - -/* - -#include -#include "dynamic_shim.h" - -*/ -import "C" -import ( - "context" - "errors" - "fmt" - "io" - "io/fs" - "log" - "os" - "path/filepath" - "strings" - "sync" - "unsafe" - - "github.com/jmorganca/ollama/api" -) - -type shimExtServer struct { - s C.struct_dynamic_llama_server - options api.Options -} - -// Note: current implementation does not support concurrent instantiations -var shimMutex sync.Mutex -var llm *shimExtServer - -const pathComponentCount = 6 - -func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { - C.dynamic_shim_llama_server_init(llm.s, sparams, err) -} -func (llm *shimExtServer) llama_server_start() { - C.dynamic_shim_llama_server_start(llm.s) -} -func (llm *shimExtServer) llama_server_stop() { - C.dynamic_shim_llama_server_stop(llm.s) -} - -func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { - 
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp) -} -func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { - C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp) -} -func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { - C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err) -} -func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { - C.dynamic_shim_llama_server_release_task_result(llm.s, result) -} - -func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) -} -func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) -} -func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err) -} -func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { - C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp) -} - -func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { - shimMutex.Lock() - defer shimMutex.Unlock() - updatePath(filepath.Dir(library)) - libPath := C.CString(library) - defer C.free(unsafe.Pointer(libPath)) - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - var srv C.struct_dynamic_llama_server - C.dynamic_shim_init(libPath, &srv, &resp) - if resp.id < 0 { - return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) - } - llm = &shimExtServer{ - s: srv, - options: opts, - } - log.Printf("Loading Dynamic Shim llm server: %s", library) - return newExtServer(llm, model, adapters, projectors, opts) -} - -func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { - return predict(ctx, llm, pred, fn) -} - -func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { - return encode(llm, ctx, prompt) -} - -func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) { - return decode(llm, ctx, tokens) -} - -func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { - return embedding(llm, ctx, input) -} - -func (llm *shimExtServer) Close() { - close(llm) -} - -func nativeInit(workdir string) error { - libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") - if err != nil { - if err == payloadMissing { - log.Printf("%s", payloadMissing) - return nil - } - return err - } - for _, lib := range libs { - // The last dir component is the variant name - variant := filepath.Base(filepath.Dir(lib)) - AvailableShims[variant] = lib - } - - if err := verifyDriverAccess(); err != nil { - return err - } - - // Report which dynamic libraries we have loaded to assist troubleshooting - variants := make([]string, len(AvailableShims)) - i := 0 - for variant := range AvailableShims { - variants[i] = variant - i++ - } - log.Printf("Dynamic LLM variants %v", variants) - - return nil -} - -func extractDynamicLibs(workDir, glob string) ([]string, error) { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return nil, payloadMissing - } - libs := 
[]string{} - - for _, file := range files { - pathComps := strings.Split(file, "/") - if len(pathComps) != pathComponentCount { - log.Printf("unexpected payload components: %v", pathComps) - continue - } - // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY - // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) - srcFile, err := libEmbed.Open(file) - if err != nil { - return nil, fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err) - } - - destFile := filepath.Join(targetDir, filepath.Base(file)) - if strings.Contains(destFile, "server") { - libs = append(libs, destFile) - } - - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return nil, fmt.Errorf("write payload %s: %v", file, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return nil, fmt.Errorf("copy payload %s: %v", file, err) - } - case err != nil: - return nil, fmt.Errorf("stat payload %s: %v", file, err) - } - } - return libs, nil -} diff --git a/llm/shim_ext_server_linux.go b/llm/shim_ext_server_linux.go deleted file mode 100644 index e0ad5da4..00000000 --- a/llm/shim_ext_server_linux.go +++ /dev/null @@ -1,46 +0,0 @@ -package llm - -import ( - "embed" - "errors" - "fmt" - "io/fs" - "log" - "os" - "strings" -) - -//go:embed llama.cpp/build/*/*/lib/*.so -var libEmbed embed.FS - -func updatePath(dir string) { - pathComponents := strings.Split(os.Getenv("PATH"), ":") - for _, comp := range pathComponents { - if comp == dir { - return - } - } - newPath := strings.Join(append(pathComponents, dir), ":") - log.Printf("Updating PATH to %s", newPath) - os.Setenv("PATH", newPath) -} - -func verifyDriverAccess() error { - // Only check ROCm access if we have the dynamic lib loaded - if _, rocmPresent := AvailableShims["rocm"]; rocmPresent { - // Verify we have permissions - either running as root, or we have group access to the driver - fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) - if err != nil { - if errors.Is(err, fs.ErrPermission) { - return fmt.Errorf("Radeon card detected, but permissions not set up properly. 
Either run ollama as root, or add you user account to the render group.") - } else if errors.Is(err, fs.ErrNotExist) { - // expected behavior without a radeon card - return nil - } - - return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) - } - fd.Close() - } - return nil -} diff --git a/llm/shim_ext_server_windows.go b/llm/shim_ext_server_windows.go deleted file mode 100644 index e95c8afa..00000000 --- a/llm/shim_ext_server_windows.go +++ /dev/null @@ -1,36 +0,0 @@ -package llm - -import ( - "embed" - "log" - "os" - "path/filepath" - "strings" -) - -//go:embed llama.cpp/build/windows/*/lib/*.dll -var libEmbed embed.FS - -func updatePath(dir string) { - tmpDir := filepath.Dir(dir) - pathComponents := strings.Split(os.Getenv("PATH"), ";") - i := 0 - for _, comp := range pathComponents { - if strings.EqualFold(comp, dir) { - return - } - // Remove any other prior paths to our temp dir - if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { - pathComponents[i] = comp - i++ - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ";") - log.Printf("Updating PATH to %s", newPath) - os.Setenv("PATH", newPath) -} - -func verifyDriverAccess() error { - // TODO if applicable - return nil -} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 846103ea..582899f7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -9,7 +9,7 @@ BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} mkdir -p dist for TARGETARCH in ${BUILD_ARCH}; do - docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS -f Dockerfile.build -t builder:$TARGETARCH . + docker build --platform=linux/$TARGETARCH --build-arg=GOFLAGS --build-arg=CGO_CFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS -f Dockerfile.build -t builder:$TARGETARCH . 
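+    # OLLAMA_CUSTOM_CPU_DEFS is optional; when exported, it is forwarded into the image build so source builds can tune the llama.cpp CPU cmake flags (consumed by llm/generate/gen_linux.sh)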
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker rm builder-$TARGETARCH diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh new file mode 100644 index 00000000..ec6b20a0 --- /dev/null +++ b/scripts/rh_linux_deps.sh @@ -0,0 +1,43 @@ +#!/bin/sh + +# Script for common Dockerfile dependency installation in Red Hat Linux based images + +set -ex +MACHINE=$(uname -m) + +if grep -i "centos" /etc/system-release >/dev/null; then + # CentOS 7 derivatives ship a git version too old to run our generate script; + # uninstall it and ignore failures + yum remove -y git + yum -y install epel-release centos-release-scl + yum -y install dnf + if [ "${MACHINE}" = "x86_64" ]; then + yum -y install https://repo.ius.io/ius-release-el7.rpm + dnf install -y git236 + else + dnf install -y rh-git227-git + ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git + fi + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ +elif grep -i "rocky" /etc/system-release >/dev/null; then + dnf install -y git gcc-toolset-10-gcc gcc-toolset-10-gcc-c++ +else + echo "ERROR Unexpected distro" + exit 1 +fi + +if [ -n "${CMAKE_VERSION}" ]; then + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +fi + +if [ -n "${GOLANG_VERSION}" ]; then + if [ "${MACHINE}" = "x86_64" ]; then + GO_ARCH="amd64" + else + GO_ARCH="arm64" + fi + mkdir -p /usr/local + curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local + ln -s /usr/local/go/bin/go /usr/local/bin/go + ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt +fi