From 58d95cc9bd446a8209e7388a96c70367cbafd653 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 14 Mar 2024 10:24:13 -0700 Subject: [PATCH 1/7] Switch back to subprocessing for llama.cpp This should resolve a number of memory leak and stability defects by allowing us to isolate llama.cpp in a separate process, shut it down when idle, and gracefully restart it if it has problems. This also serves as a first step toward running multiple copies to support multiple models concurrently. --- .github/workflows/test.yaml | 42 +- .gitignore | 3 +- Dockerfile | 25 +- app/lifecycle/server.go | 25 +- gpu/amd_linux.go | 17 + gpu/assets.go | 8 +- llm/dyn_ext_server.c | 142 --- llm/dyn_ext_server.go | 388 -------- llm/dyn_ext_server.h | 74 -- llm/ext_server/CMakeLists.txt | 27 +- llm/ext_server/README.md | 18 - llm/ext_server/ext_server.cpp | 377 -------- llm/ext_server/ext_server.h | 95 -- llm/ext_server/server.cpp | 2 +- llm/generate/gen_common.sh | 29 +- llm/generate/gen_darwin.sh | 47 +- llm/generate/gen_linux.sh | 63 +- llm/generate/gen_windows.ps1 | 98 +- llm/generate/generate_darwin.go | 2 +- llm/llama.go | 100 -- llm/llm.go | 190 +--- llm/{payload_linux.go => llm_darwin_amd64.go} | 2 +- ...payload_windows.go => llm_darwin_arm64.go} | 2 +- llm/llm_linux.go | 6 + llm/llm_windows.go | 6 + llm/payload.go | 211 +++++ llm/payload_common.go | 233 ----- llm/payload_darwin_amd64.go | 8 - llm/payload_darwin_arm64.go | 8 - llm/payload_test.go | 58 -- llm/server.go | 854 ++++++++++++++++++ llm/status.go | 42 + llm/utils.go | 15 - server/routes.go | 82 +- server/routes_test.go | 27 +- 35 files changed, 1416 insertions(+), 1910 deletions(-) delete mode 100644 llm/dyn_ext_server.c delete mode 100644 llm/dyn_ext_server.go delete mode 100644 llm/dyn_ext_server.h delete mode 100644 llm/ext_server/README.md delete mode 100644 llm/ext_server/ext_server.cpp delete mode 100644 llm/ext_server/ext_server.h delete mode 100644 llm/llama.go rename llm/{payload_linux.go => llm_darwin_amd64.go} (56%) rename llm/{payload_windows.go => llm_darwin_arm64.go} (52%) create mode 100644 llm/llm_linux.go create mode 100644 llm/llm_windows.go create mode 100644 llm/payload.go delete mode 100644 llm/payload_common.go delete mode 100644 llm/payload_darwin_amd64.go delete mode 100644 llm/payload_darwin_arm64.go delete mode 100644 llm/payload_test.go create mode 100644 llm/server.go create mode 100644 llm/status.go delete mode 100644 llm/utils.go diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 41e1879f..752c8ebf 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -56,10 +56,12 @@ jobs: - run: go get ./... - run: | $gopath=(get-command go).source | split-path -parent + $gccpath=(get-command gcc).source | split-path -parent & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" cd $env:GITHUB_WORKSPACE $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$env:PATH" + $env:PATH="$gopath;$gccpath;$env:PATH" + echo $env:PATH go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }} name: "Windows Go Generate" @@ -69,7 +71,9 @@ jobs: - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries - path: llm/llama.cpp/build/**/lib/* + path: | + llm/build/**/bin/* + llm/build/**/*.a generate-cuda: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} @@ -100,7 +104,7 @@ jobs: - uses: actions/upload-artifact@v4 with: name: cuda-${{ matrix.cuda-version }}-libraries - path: llm/llama.cpp/build/**/lib/* + path: llm/build/**/bin/* generate-rocm: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} @@ -131,7 +135,7 @@ jobs: - uses: actions/upload-artifact@v4 with: name: rocm-${{ matrix.rocm-version }}-libraries - path: llm/llama.cpp/build/**/lib/* + path: llm/build/**/lib/* # ROCm generation step generate-windows-rocm: @@ -244,17 +248,17 @@ jobs: esac >>$GITHUB_ENV shell: bash - run: | - mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ - touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so + mkdir -p llm/build/linux/$ARCH/stub/bin/ + touch llm/build/linux/$ARCH/stub/bin/stub.so if: ${{ startsWith(matrix.os, 'ubuntu-') }} - run: | - mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ - touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib - touch llm/llama.cpp/ggml-metal.metal + mkdir -p llm/build/darwin/$ARCH/stub/bin/ + touch llm/build/darwin/$ARCH/stub/bin/stub.dylib + touch llm/ggml-metal.metal if: ${{ startsWith(matrix.os, 'macos-') }} - run: | - mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ - touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll + mkdir -p llm/build/windows/$ARCH/stub/stub/bin/ + touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll if: ${{ startsWith(matrix.os, 'windows-') }} - uses: golangci/golangci-lint-action@v3 test: @@ -271,6 +275,7 @@ jobs: env: GOARCH: ${{ matrix.arch }} CGO_ENABLED: '1' + OLLAMA_CPU_TARGET: "static" steps: - uses: actions/checkout@v4 with: @@ -287,18 +292,19 @@ jobs: esac >>$GITHUB_ENV shell: bash - run: | - mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ - touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so + mkdir -p llm/build/linux/$ARCH/stub/bin/ + touch llm//build/linux/$ARCH/stub/bin/stub.so if: ${{ startsWith(matrix.os, 'ubuntu-') }} - run: | - mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ - touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib - touch llm/llama.cpp/ggml-metal.metal + mkdir -p llm/build/darwin/$ARCH/stub/bin/ + touch llm/build/darwin/$ARCH/stub/bin/stub.dylib + touch llm/ggml-metal.metal if: ${{ startsWith(matrix.os, 'macos-') }} - run: | - mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ - touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll + mkdir -p llm/build/windows/$ARCH/stub/stub/bin/ + touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll if: ${{ startsWith(matrix.os, 'windows-') }} + - run: go generate ./... - run: go build - run: go test -v ./... 
- uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index 388175f7..e0362a19 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ ggml-metal.metal *.exe .idea test_data -*.crt \ No newline at end of file +*.crt +llm/build \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 41081ce7..a3267ffd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS WORKDIR /go/src/github.com/ollama/ollama/llm/generate +FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 +RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 @@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh -FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64 +FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64 ARG CMAKE_VERSION ARG GOLANG_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +WORKDIR /go/src/github.com/ollama/ollama/llm/generate + +FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 +RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh + # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 ENV CGO_ENABLED 1 WORKDIR /go/src/github.com/ollama/ollama COPY . . -COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ +COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS @@ -101,8 +108,8 @@ ENV CGO_ENABLED 1 ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ -RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ +COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS RUN go build -trimpath . diff --git a/app/lifecycle/server.go b/app/lifecycle/server.go index e3ca22f9..0ce90df9 100644 --- a/app/lifecycle/server.go +++ b/app/lifecycle/server.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "syscall" "time" "github.com/ollama/ollama/api" @@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) { io.Copy(logFile, stderr) //nolint:errcheck }() + // Re-wire context done behavior to attempt a graceful shutdown of the server + cmd.Cancel = func() error { + if cmd.Process != nil { + cmd.Process.Signal(os.Interrupt) //nolint:errcheck + tick := time.NewTicker(10 * time.Millisecond) + defer tick.Stop() + for { + select { + case <-tick.C: + // OS agnostic "is it still running" + if proc, err := os.FindProcess(int(cmd.Process.Pid)); err != nil || errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) { + return nil //nolint:nilerr + } + case <-time.After(5 * time.Second): + slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid) + cmd.Process.Kill() //nolint:errcheck + } + } + } + return nil + } + // run the command and wait for it to finish if err := cmd.Start(); err != nil { return done, fmt.Errorf("failed to start server %w", err) @@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) { select { case <-ctx.Done(): - slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code)) + slog.Info(fmt.Sprintf("server shutdown with exit code %d", code)) done <- code return default: diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 27ae679f..529fb8db 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) { return } + updateLibPath(libDir) + gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") if gfxOverride == "" { supported, err := GetSupportedGFX(libDir) @@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) { } } +func updateLibPath(libDir string) { + ldPaths := []string{} + if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok { + ldPaths = strings.Split(val, ":") + } + for _, d := range ldPaths { + if d == libDir { + return + } + } + val := strings.Join(append(ldPaths, libDir), ":") + slog.Debug("updated lib path", "LD_LIBRARY_PATH", val) + os.Setenv("LD_LIBRARY_PATH", val) +} + // Walk the sysfs nodes for the available GPUs and gather information from them // skipping over any devices in the skip map func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { diff --git a/gpu/assets.go b/gpu/assets.go index 28364f83..539635ee 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -11,6 +11,7 @@ import ( "strings" "sync" "syscall" + "time" ) var ( @@ -84,7 +85,12 @@ func Cleanup() { slog.Debug("cleaning up", "dir", tmpDir) err := os.RemoveAll(tmpDir) if err != nil { - slog.Warn("failed to clean up", "dir", tmpDir, "err", err) + // On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove + time.Sleep(1000 * time.Millisecond) + err = os.RemoveAll(tmpDir) + if err != nil { + slog.Warn("failed to clean up", "dir", tmpDir, "err", err) + } } } } diff --git 
a/llm/dyn_ext_server.c b/llm/dyn_ext_server.c deleted file mode 100644 index dab49f85..00000000 --- a/llm/dyn_ext_server.c +++ /dev/null @@ -1,142 +0,0 @@ -#include "dyn_ext_server.h" - -#include -#include - -#ifdef __linux__ -#include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) -#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) -#define LOAD_ERR() strdup(dlerror()) -#define UNLOAD_LIBRARY(handle) dlclose(handle) -#elif _WIN32 -#include -#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) -#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) -#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) -#define LOAD_ERR() ({\ - LPSTR messageBuffer = NULL; \ - size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \ - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \ - char *resp = strdup(messageBuffer); \ - LocalFree(messageBuffer); \ - resp; \ -}) -#else -#include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) -#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) -#define LOAD_ERR() strdup(dlerror()) -#define UNLOAD_LIBRARY(handle) dlclose(handle) -#endif - -void dyn_init(const char *libPath, struct dynamic_llama_server *s, - ext_server_resp_t *err) { - int i = 0; - struct lookup { - char *s; - void **p; - } l[] = { - {"llama_server_init", (void *)&s->llama_server_init}, - {"llama_server_start", (void *)&s->llama_server_start}, - {"llama_server_stop", (void *)&s->llama_server_stop}, - {"llama_server_completion", (void *)&s->llama_server_completion}, - {"llama_server_completion_next_result", - (void *)&s->llama_server_completion_next_result}, - {"llama_server_completion_cancel", - (void *)&s->llama_server_completion_cancel}, - {"llama_server_release_task_result", - (void *)&s->llama_server_release_task_result}, - {"llama_server_tokenize", (void *)&s->llama_server_tokenize}, - {"llama_server_detokenize", (void *)&s->llama_server_detokenize}, - {"llama_server_embedding", (void *)&s->llama_server_embedding}, - {"llama_server_release_json_resp", - (void *)&s->llama_server_release_json_resp}, - {"", NULL}, - }; - - printf("loading library %s\n", libPath); - s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW); - if (!s->handle) { - err->id = -1; - char *msg = LOAD_ERR(); - snprintf(err->msg, err->msg_len, - "Unable to load dynamic server library: %s", msg); - free(msg); - return; - } - - for (i = 0; l[i].p != NULL; i++) { - *l[i].p = LOAD_SYMBOL(s->handle, l[i].s); - if (!l[i].p) { - UNLOAD_LIBRARY(s->handle); - err->id = -1; - char *msg = LOAD_ERR(); - snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s", - l[i].s, msg); - free(msg); - return; - } - } -} - -inline void dyn_llama_server_init(struct dynamic_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err) { - s.llama_server_init(sparams, err); -} - -inline void dyn_llama_server_start(struct dynamic_llama_server s) { - s.llama_server_start(); -} - -inline void dyn_llama_server_stop(struct dynamic_llama_server s) { - s.llama_server_stop(); -} - -inline void dyn_llama_server_completion(struct dynamic_llama_server s, - const char *json_req, - ext_server_resp_t *resp) { - s.llama_server_completion(json_req, resp); -} - -inline void dyn_llama_server_completion_next_result( - struct dynamic_llama_server s, const int task_id, - ext_server_task_result_t *result) { - s.llama_server_completion_next_result(task_id, result); -} - -inline void dyn_llama_server_completion_cancel( - struct 
dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { - s.llama_server_completion_cancel(task_id, err); -} -inline void dyn_llama_server_release_task_result( - struct dynamic_llama_server s, ext_server_task_result_t *result) { - s.llama_server_release_task_result(result); -} - -inline void dyn_llama_server_tokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_tokenize(json_req, json_resp, err); -} - -inline void dyn_llama_server_detokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_detokenize(json_req, json_resp, err); -} - -inline void dyn_llama_server_embedding(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_embedding(json_req, json_resp, err); -} - -inline void dyn_llama_server_release_json_resp( - struct dynamic_llama_server s, char **json_resp) { - s.llama_server_release_json_resp(json_resp); -} diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go deleted file mode 100644 index 7bd2067d..00000000 --- a/llm/dyn_ext_server.go +++ /dev/null @@ -1,388 +0,0 @@ -package llm - -/* -#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server -#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds -#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE -#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG -#cgo darwin LDFLAGS: -lc++ -framework Accelerate -#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -#cgo linux CFLAGS: -D_GNU_SOURCE -#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm -#cgo linux windows LDFLAGS: -lpthread - -#include -#include "dyn_ext_server.h" - -*/ -import "C" - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "log/slog" - "os" - "path/filepath" - "strings" - "sync" - "time" - "unsafe" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/gpu" -) - -type dynExtServer struct { - s C.struct_dynamic_llama_server - options *api.Options -} - -// Note: current implementation does not support concurrent instantiations -var mutex sync.Mutex - -func newExtServerResp(len C.size_t) C.ext_server_resp_t { - var resp C.ext_server_resp_t - resp.msg_len = len - bytes := make([]byte, len) - resp.msg = (*C.char)(C.CBytes(bytes)) - return resp -} - -func freeExtServerResp(resp C.ext_server_resp_t) { - if resp.msg_len == 0 { - return - } - C.free(unsafe.Pointer(resp.msg)) -} - -func extServerResponseToErr(resp C.ext_server_resp_t) error { - return fmt.Errorf(C.GoString(resp.msg)) -} - -func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) { - if !mutex.TryLock() { - slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete") - mutex.Lock() - } - gpu.UpdatePath(filepath.Dir(library)) - libPath := C.CString(library) - defer C.free(unsafe.Pointer(libPath)) - resp := newExtServerResp(512) - defer freeExtServerResp(resp) - var srv C.struct_dynamic_llama_server - C.dyn_init(libPath, &srv, &resp) - if resp.id < 0 { - mutex.Unlock() - return nil, 
fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) - } - llm := dynExtServer{ - s: srv, - options: opts, - } - slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library)) - - var sparams C.ext_server_params_t - sparams.model = C.CString(model) - defer C.free(unsafe.Pointer(sparams.model)) - - sparams.embedding = true - sparams.n_ctx = C.uint(opts.NumCtx) - sparams.n_batch = C.uint(opts.NumBatch) - sparams.n_gpu_layers = C.int(opts.NumGPU) - sparams.main_gpu = C.int(opts.MainGPU) - sparams.n_parallel = 1 // TODO - wire up concurrency - - // Always use the value encoded in the model - sparams.rope_freq_base = 0.0 - sparams.rope_freq_scale = 0.0 - sparams.memory_f16 = C.bool(opts.F16KV) - sparams.use_mlock = C.bool(opts.UseMLock) - sparams.use_mmap = C.bool(opts.UseMMap) - - if opts.UseNUMA { - sparams.numa = C.int(1) - } else { - sparams.numa = C.int(0) - } - - sparams.lora_adapters = nil - for i := 0; i < len(adapters); i++ { - la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t)) - defer C.free(unsafe.Pointer(la)) - la.adapter = C.CString(adapters[i]) - defer C.free(unsafe.Pointer(la.adapter)) - la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX - la.next = nil - if i == 0 { - sparams.lora_adapters = la - } else { - tmp := sparams.lora_adapters - for ; tmp.next != nil; tmp = tmp.next { - } - tmp.next = la - } - } - - if len(projectors) > 0 { - // TODO: applying multiple projectors is not supported by the llama.cpp server yet - sparams.mmproj = C.CString(projectors[0]) - defer C.free(unsafe.Pointer(sparams.mmproj)) - } else { - sparams.mmproj = nil - } - - sparams.n_threads = C.uint(opts.NumThread) - - if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { - sparams.verbose_logging = C.bool(true) - } else { - sparams.verbose_logging = C.bool(false) - } - - slog.Info("Initializing llama server") - slog.Debug(fmt.Sprintf("server params: %+v", sparams)) - initResp := newExtServerResp(512) - defer freeExtServerResp(initResp) - C.dyn_llama_server_init(llm.s, &sparams, &initResp) - if initResp.id < 0 { - mutex.Unlock() - err := extServerResponseToErr(initResp) - slog.Debug(fmt.Sprintf("failure during initialization: %s", err)) - return nil, err - } - - slog.Info("Starting llama main loop") - C.dyn_llama_server_start(llm.s) - return &llm, nil -} - -func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - - if len(predict.Images) > 0 { - slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images))) - } - - request := map[string]any{ - "prompt": predict.Prompt, - "stream": true, - "n_predict": predict.Options.NumPredict, - "n_keep": predict.Options.NumKeep, - "temperature": predict.Options.Temperature, - "top_k": predict.Options.TopK, - "top_p": predict.Options.TopP, - "tfs_z": predict.Options.TFSZ, - "typical_p": predict.Options.TypicalP, - "repeat_last_n": predict.Options.RepeatLastN, - "repeat_penalty": predict.Options.RepeatPenalty, - "presence_penalty": predict.Options.PresencePenalty, - "frequency_penalty": predict.Options.FrequencyPenalty, - "mirostat": predict.Options.Mirostat, - "mirostat_tau": predict.Options.MirostatTau, - "mirostat_eta": predict.Options.MirostatEta, - "penalize_nl": predict.Options.PenalizeNewline, - "seed": predict.Options.Seed, - "stop": predict.Options.Stop, - "image_data": predict.Images, - "cache_prompt": true, - } - - if predict.Format == "json" { - request["grammar"] = 
jsonGrammar - if !strings.Contains(strings.ToLower(predict.Prompt), "json") { - slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.") - } - } - - retryDelay := 100 * time.Microsecond - for retries := 0; retries < maxRetries; retries++ { - if retries > 0 { - time.Sleep(retryDelay) // wait before retrying - retryDelay *= 2 // exponential backoff - } - - // Handling JSON marshaling with special characters unescaped. - buffer := &bytes.Buffer{} - enc := json.NewEncoder(buffer) - enc.SetEscapeHTML(false) - - if err := enc.Encode(request); err != nil { - return fmt.Errorf("failed to marshal data: %w", err) - } - - req := C.CString(buffer.String()) - defer C.free(unsafe.Pointer(req)) - - C.dyn_llama_server_completion(llm.s, req, &resp) - if resp.id < 0 { - return extServerResponseToErr(resp) - } - - retryNeeded := false - // keep track of the last token generated, this is used to abort if the model starts looping - var lastToken string - var tokenRepeat int - out: - for { - select { - case <-ctx.Done(): - return cancelCompletion(llm, resp) - default: - var result C.ext_server_task_result_t - C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result) - json_resp := C.GoString(result.json_resp) - C.dyn_llama_server_release_task_result(llm.s, &result) - - var p prediction - if err := json.Unmarshal([]byte(json_resp), &p); err != nil { - C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) - if resp.id < 0 { - return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) - } else { - return fmt.Errorf("error unmarshaling llm prediction response: %w", err) - } - } - - if bool(result.error) && strings.Contains(json_resp, "slot unavailable") { - retryNeeded = true - // task will already be canceled - break out - } - - switch { - case strings.TrimSpace(p.Content) == lastToken: - tokenRepeat++ - default: - lastToken = strings.TrimSpace(p.Content) - tokenRepeat = 0 - } - - // 30 picked as an arbitrary max token repeat limit, modify as needed - if tokenRepeat > 30 { - slog.Debug("prediction aborted, token repeat limit reached") - return cancelCompletion(llm, resp) - } - - if p.Content != "" { - fn(PredictResult{ - Content: p.Content, - }) - } - - if p.Stop || bool(result.stop) { - fn(PredictResult{ - Done: true, - PromptEvalCount: p.Timings.PromptN, - PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), - EvalCount: p.Timings.PredictedN, - EvalDuration: parseDurationMs(p.Timings.PredictedMS), - }) - return nil - } - } - } - if !retryNeeded { - return nil // success - } - } - - // should never reach here ideally - return fmt.Errorf("max retries exceeded") -} - -func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error { - C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp) - if resp.id < 0 { - return extServerResponseToErr(resp) - } else { - return nil - } -} - -func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { - data, err := json.Marshal(TokenizeRequest{Content: prompt}) - if err != nil { - return nil, fmt.Errorf("marshaling encode data: %w", err) - } - req := C.CString(string(data)) - defer C.free(unsafe.Pointer(req)) - var json_resp *C.char - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp) - if resp.id < 0 { - return nil, extServerResponseToErr(resp) - } - defer 
C.dyn_llama_server_release_json_resp(llm.s, &json_resp) - - var encoded TokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { - return nil, fmt.Errorf("unmarshal encode response: %w", err2) - } - - return encoded.Tokens, err -} - -func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) { - if len(tokens) == 0 { - return "", nil - } - data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) - if err != nil { - return "", fmt.Errorf("marshaling decode data: %w", err) - } - - req := C.CString(string(data)) - defer C.free(unsafe.Pointer(req)) - var json_resp *C.char - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp) - if resp.id < 0 { - return "", extServerResponseToErr(resp) - } - defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) - - var decoded DetokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { - return "", fmt.Errorf("unmarshal encode response: %w", err2) - } - - return decoded.Content, err -} - -func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { - data, err := json.Marshal(TokenizeRequest{Content: input}) - if err != nil { - return nil, fmt.Errorf("error marshaling embed data: %w", err) - } - - req := C.CString(string(data)) - defer C.free(unsafe.Pointer(req)) - var json_resp *C.char - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp) - if resp.id < 0 { - return nil, extServerResponseToErr(resp) - } - defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp) - - var embedding EmbeddingResponse - if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { - return nil, fmt.Errorf("unmarshal tokenize response: %w", err) - } - - return embedding.Embedding, nil -} - -func (llm *dynExtServer) Close() { - C.dyn_llama_server_stop(llm.s) - mutex.Unlock() -} diff --git a/llm/dyn_ext_server.h b/llm/dyn_ext_server.h deleted file mode 100644 index cddf4a1f..00000000 --- a/llm/dyn_ext_server.h +++ /dev/null @@ -1,74 +0,0 @@ -#include - -#include "ext_server.h" - -#ifdef __cplusplus -extern "C" { -#endif -struct dynamic_llama_server { - void *handle; - void (*llama_server_init)(ext_server_params_t *sparams, - ext_server_resp_t *err); - void (*llama_server_start)(); - void (*llama_server_stop)(); - void (*llama_server_completion)(const char *json_req, - ext_server_resp_t *resp); - void (*llama_server_completion_next_result)(const int task_id, - ext_server_task_result_t *result); - void (*llama_server_completion_cancel)(const int task_id, - ext_server_resp_t *err); - void (*llama_server_release_task_result)(ext_server_task_result_t *result); - void (*llama_server_tokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_detokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_embedding)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_release_json_resp)(char **json_resp); -}; - -void dyn_init(const char *libPath, struct dynamic_llama_server *s, - ext_server_resp_t *err); - -// No good way to call C function pointers from Go so inline the indirection -void dyn_llama_server_init(struct dynamic_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err); - -void dyn_llama_server_start(struct dynamic_llama_server s); - 
-void dyn_llama_server_stop(struct dynamic_llama_server s); - -void dyn_llama_server_completion(struct dynamic_llama_server s, - const char *json_req, - ext_server_resp_t *resp); - -void dyn_llama_server_completion_next_result( - struct dynamic_llama_server s, const int task_id, - ext_server_task_result_t *result); - -void dyn_llama_server_completion_cancel(struct dynamic_llama_server s, - const int task_id, - ext_server_resp_t *err); - -void dyn_llama_server_release_task_result( - struct dynamic_llama_server s, ext_server_task_result_t *result); - -void dyn_llama_server_tokenize(struct dynamic_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); - -void dyn_llama_server_detokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err); - -void dyn_llama_server_embedding(struct dynamic_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); -void dyn_llama_server_release_json_resp(struct dynamic_llama_server s, - char **json_resp); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index 3b995eb3..db7d52dc 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,21 +1,14 @@ -set(TARGET ext_server) +set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) +install(TARGETS ${TARGET} RUNTIME) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$ +) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) if (WIN32) - add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp) -else() - add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() -target_compile_features(${TARGET} PRIVATE cxx_std_11) -target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) -target_link_libraries(${TARGET} PRIVATE ggml llava common ) -set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$) -install(TARGETS ext_server LIBRARY) - -if (CUDAToolkit_FOUND) - target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - if (WIN32) - target_link_libraries(${TARGET} PRIVATE nvml) - endif() -endif() \ No newline at end of file +target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file diff --git a/llm/ext_server/README.md b/llm/ext_server/README.md deleted file mode 100644 index bfb0d4a6..00000000 --- a/llm/ext_server/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Extern C Server - -This directory contains a thin facade we layer on top of the Llama.cpp server to -expose `extern C` interfaces to access the functionality through direct API -calls in-process. The llama.cpp code uses compile time macros to configure GPU -type along with other settings. During the `go generate ./...` execution, the -build will generate one or more copies of the llama.cpp `extern C` server based -on what GPU libraries are detected to support multiple GPU types as well as CPU -only support. The Ollama go build then embeds these different servers to support -different GPUs and settings at runtime. 
- -If you are making changes to the code in this directory, make sure to disable -caching during your go build to ensure you pick up your changes. A typical -iteration cycle from the top of the source tree looks like: - -``` -go generate ./... && go build -a . -``` \ No newline at end of file diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp deleted file mode 100644 index b2067059..00000000 --- a/llm/ext_server/ext_server.cpp +++ /dev/null @@ -1,377 +0,0 @@ -#include "ext_server.h" -#include - -// Necessary evil since the server types are not defined in a header -#include "server.cpp" - -// Low level API access to verify GPU access -#if defined(GGML_USE_CUBLAS) -#if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define cudaGetDevice hipGetDevice -#define cudaError_t hipError_t -#define cudaSuccess hipSuccess -#define cudaGetErrorString hipGetErrorString -#else -#include -#include -#include -#endif // defined(GGML_USE_HIPBLAS) -#endif // GGML_USE_CUBLAS - -// Expose the llama server as a callable extern "C" API -llama_server_context *llama = NULL; -std::thread ext_server_thread; -bool shutting_down = false; -std::atomic_int recv_counter; - -// RAII wrapper for tracking in-flight recv calls -class atomicRecv { - public: - atomicRecv(std::atomic &atomic) : atomic(atomic) { - ++this->atomic; - } - ~atomicRecv() { - --this->atomic; - } - private: - std::atomic &atomic; -}; - -void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) { - recv_counter = 0; - assert(err != NULL && sparams != NULL); - log_set_target(stderr); - if (!sparams->verbose_logging) { - server_verbose = true; - log_disable(); - } - - LOG_TEE("system info: %s\n", llama_print_system_info()); - err->id = 0; - err->msg[0] = '\0'; - try { - llama = new llama_server_context; - gpt_params params; - params.n_ctx = sparams->n_ctx; - params.n_batch = sparams->n_batch; - if (sparams->n_threads > 0) { - params.n_threads = sparams->n_threads; - } - params.n_parallel = sparams->n_parallel; - params.rope_freq_base = sparams->rope_freq_base; - params.rope_freq_scale = sparams->rope_freq_scale; - - if (sparams->memory_f16) { - params.cache_type_k = "f16"; - params.cache_type_v = "f16"; - } else { - params.cache_type_k = "f32"; - params.cache_type_v = "f32"; - } - - params.n_gpu_layers = sparams->n_gpu_layers; - params.main_gpu = sparams->main_gpu; - params.use_mlock = sparams->use_mlock; - params.use_mmap = sparams->use_mmap; - params.numa = (ggml_numa_strategy)sparams->numa; - params.embedding = sparams->embedding; - if (sparams->model != NULL) { - params.model = sparams->model; - } - - if (sparams->lora_adapters != NULL) { - for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; - la = la->next) { - params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); - } - - params.use_mmap = false; - } - - if (sparams->mmproj != NULL) { - params.mmproj = std::string(sparams->mmproj); - } - -#if defined(GGML_USE_CUBLAS) - // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible - LOG_TEE("Performing pre-initialization of GPU\n"); - int id; - cudaError_t cudaErr = cudaGetDevice(&id); - if (cudaErr != cudaSuccess) { - err->id = -1; - snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr)); - return; - } -#endif - - llama_backend_init(); - llama_numa_init(params.numa); - 
- if (!llama->load_model(params)) { - // an error occurred that was not thrown - err->id = -1; - snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str()); - return; - } - - llama->initialize(); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, - "Unknown exception initializing llama server"); - } -} - -void llama_server_start() { - assert(llama != NULL); - // TODO mutex to protect thread creation - ext_server_thread = std::thread([&]() { - try { - LOG_TEE("llama server main loop starting\n"); - ggml_time_init(); - llama->queue_tasks.on_new_task(std::bind( - &llama_server_context::process_single_task, llama, std::placeholders::_1)); - llama->queue_tasks.on_finish_multitask(std::bind( - &llama_server_context::on_finish_multitask, llama, std::placeholders::_1)); - llama->queue_tasks.on_run_slots(std::bind( - &llama_server_context::update_slots, llama)); - llama->queue_results.on_multitask_update(std::bind( - &llama_server_queue::update_multitask, - &llama->queue_tasks, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3 - )); - llama->queue_tasks.start_loop(); - } catch (std::exception &e) { - LOG_TEE("caught exception in llama server main loop: %s\n", e.what()); - } catch (...) { - LOG_TEE("caught unknown exception in llama server main loop\n"); - } - LOG_TEE("\nllama server shutting down\n"); - llama_backend_free(); - }); -} - -void llama_server_stop() { - assert(llama != NULL); - // Shutdown any in-flight requests and block incoming requests. - LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n"); - shutting_down = true; - - while (recv_counter.load() > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } - - // This may take a while for any pending tasks to drain - // TODO - consider a timeout to cancel tasks if it's taking too long - llama->queue_tasks.terminate(); - ext_server_thread.join(); - delete llama; - llama = NULL; - LOG_TEE("llama server shutdown complete\n"); - shutting_down = false; -} - -void llama_server_completion(const char *json_req, ext_server_resp_t *resp) { - assert(llama != NULL && json_req != NULL && resp != NULL); - resp->id = -1; - resp->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - json data = json::parse(json_req); - resp->id = llama->queue_tasks.get_new_id(); - llama->queue_results.add_waiting_task_id(resp->id); - llama->request_completion(resp->id, data, false, false, -1); - } catch (std::exception &e) { - snprintf(resp->msg, resp->msg_len, "exception %s", e.what()); - } catch (...) 
{ - snprintf(resp->msg, resp->msg_len, "Unknown exception during completion"); - } -} - -void llama_server_completion_next_result(const int task_id, - ext_server_task_result_t *resp) { - assert(llama != NULL && resp != NULL); - resp->id = -1; - resp->stop = false; - resp->error = false; - resp->json_resp = NULL; - std::string result_json; - try { - atomicRecv ar(recv_counter); - task_result result = llama->queue_results.recv(task_id); - result_json = - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); - resp->id = result.id; - resp->stop = result.stop; - resp->error = result.error; - if (result.error) { - LOG_TEE("next result cancel on error\n"); - llama->request_cancel(task_id); - LOG_TEE("next result removing waiting tak ID: %d\n", task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } else if (result.stop) { - LOG_TEE("next result cancel on stop\n"); - llama->request_cancel(task_id); - LOG_TEE("next result removing waiting task ID: %d\n", task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } else if (shutting_down) { - LOG_TEE("aborting completion due to shutdown %d\n", task_id); - llama->request_cancel(task_id); - llama->queue_results.remove_waiting_task_id(task_id); - resp->stop = true; - } - } catch (std::exception &e) { - resp->error = true; - resp->id = -1; - result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}"; - LOG_TEE("llama server completion exception %s\n", e.what()); - } catch (...) { - resp->error = true; - resp->id = -1; - result_json = "{\"error\":\"Unknown exception during completion\"}"; - LOG_TEE("llama server completion unknown exception\n"); - } - const std::string::size_type size = result_json.size() + 1; - resp->json_resp = new char[size]; - snprintf(resp->json_resp, size, "%s", result_json.c_str()); -} - -void llama_server_release_task_result(ext_server_task_result_t *result) { - if (result == NULL || result->json_resp == NULL) { - return; - } - delete[] result->json_resp; -} - -void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) { - assert(llama != NULL && err != NULL); - err->id = 0; - err->msg[0] = '\0'; - try { - llama->request_cancel(task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, - "Unknown exception completion cancel in llama server"); - } -} - -void llama_server_tokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - std::vector tokens; - if (body.count("content") != 0) { - tokens = llama->tokenize(body["content"], false); - } - const json data = format_tokenizer_response(tokens); - std::string result_json = data.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) 
{ - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during tokenize"); - } -} - -void llama_server_release_json_resp(char **json_resp) { - if (json_resp == NULL || *json_resp == NULL) { - return; - } - delete[] *json_resp; -} - -void llama_server_detokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - std::string content; - if (body.count("tokens") != 0) { - const std::vector tokens = body["tokens"]; - content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend()); - } - const json data = format_detokenized_response(content); - std::string result_json = data.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during detokenize"); - } -} - -void llama_server_embedding(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - json prompt; - if (body.count("content") != 0) { - prompt = body["content"]; - } else { - prompt = ""; - } - const int task_id = llama->queue_tasks.get_new_id(); - llama->queue_results.add_waiting_task_id(task_id); - llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1); - atomicRecv ar(recv_counter); - task_result result = llama->queue_results.recv(task_id); - std::string result_json = result.result_json.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - llama->queue_results.remove_waiting_task_id(task_id); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) 
{ - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during embedding"); - } -} \ No newline at end of file diff --git a/llm/ext_server/ext_server.h b/llm/ext_server/ext_server.h deleted file mode 100644 index 9b9ce2ec..00000000 --- a/llm/ext_server/ext_server.h +++ /dev/null @@ -1,95 +0,0 @@ -#if defined(LLAMA_SERVER_LIBRARY) -#ifndef LLAMA_SERVER_H -#define LLAMA_SERVER_H -#include -#include -#include -#include - -int __main(int argc, char **argv); - -// This exposes extern C entrypoints into the llama_server -// To enable the server compile with LLAMA_SERVER_LIBRARY - -#ifdef __cplusplus -extern "C" { -#endif -typedef struct ext_server_resp { - int id; // < 0 on error - size_t msg_len; // caller must allocate msg and set msg_len - char *msg; -} ext_server_resp_t; - -// Allocated and freed by caller -typedef struct ext_server_lora_adapter { - char *adapter; - float scale; - struct ext_server_lora_adapter *next; -} ext_server_lora_adapter_t; - -// Allocated and freed by caller -typedef struct ext_server_params { - char *model; - uint32_t n_ctx; // token context window, 0 = from model - uint32_t n_batch; // prompt processing maximum batch size - uint32_t n_threads; // number of threads to use for generation - int32_t n_parallel; // number of parallel sequences to decodewra - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - bool memory_f16; // use f16 instead of f32 for memory kv - int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu; // the GPU that is used for scratch and small tensors - bool use_mlock; // force system to keep model in RAM - bool use_mmap; // use mmap if possible - int numa; // attempt optimizations that help on some NUMA systems - bool embedding; // get only sentence embedding - ext_server_lora_adapter_t *lora_adapters; - char *mmproj; - bool verbose_logging; // Enable verbose logging of the server -} ext_server_params_t; - -typedef struct ext_server_task_result { - int id; - bool stop; - bool error; - char *json_resp; // null terminated, memory managed by ext_server -} ext_server_task_result_t; - -// Initialize the server once per process -// err->id = 0 for success and err->msg[0] = NULL -// err->id != 0 for failure, and err->msg contains error message -void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err); - -// Run the main loop, called once per init -void llama_server_start(); -// Stop the main loop and free up resources allocated in init and start. 
Init -// must be called again to reuse -void llama_server_stop(); - -// json_req null terminated string, memory managed by caller -// resp->id >= 0 on success (task ID) -// resp->id < 0 on error, and resp->msg contains error message -void llama_server_completion(const char *json_req, ext_server_resp_t *resp); - -// Caller must call llama_server_release_task_result to free resp->json_resp -void llama_server_completion_next_result(const int task_id, - ext_server_task_result_t *result); -void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err); -void llama_server_release_task_result(ext_server_task_result_t *result); - -// Caller must call llama_server_releaes_json_resp to free json_resp if err.id < -// 0 -void llama_server_tokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_detokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_embedding(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_release_json_resp(char **json_resp); - -#ifdef __cplusplus -} -#endif - -#endif -#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 80fc8fe2..5df5bb47 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2768,7 +2768,7 @@ inline void signal_handler(int signal) { shutdown_handler(signal); } -int _main(int argc, char **argv) +int main(int argc, char **argv) { #if SERVER_VERBOSE != 1 log_disable(); diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index 1186a06b..16ff710a 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -14,7 +14,7 @@ init_vars() { LLAMACPP_DIR=../llama.cpp CMAKE_DEFS="" - CMAKE_TARGETS="--target ext_server" + CMAKE_TARGETS="--target ollama_llama_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" else @@ -81,27 +81,24 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 - mkdir -p ${BUILD_DIR}/lib/ - ls ${BUILD_DIR} - g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \ - ${GCC_ARCH} \ - ${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \ - ${BUILD_DIR}/common/libcommon.a \ - ${BUILD_DIR}/libllama.a \ - -Wl,-rpath,\$ORIGIN \ - -lpthread -ldl -lm \ - ${EXTRA_LIBS} } -compress_libs() { +compress() { echo "Compressing payloads to reduce overall binary size..." pids="" - rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz - for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do - gzip -n --best -f ${lib} & + rm -rf ${BUILD_DIR}/bin/*.gz + for f in ${BUILD_DIR}/bin/* ; do + gzip -n --best -f ${f} & pids+=" $!" done - echo + # check for lib directory + if [ -d ${BUILD_DIR}/lib ]; then + for f in ${BUILD_DIR}/lib/* ; do + gzip -n --best -f ${f} & + pids+=" $!" 
+ done + fi + echo for pid in ${pids}; do wait $pid done diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 59bdc801..1fb84181 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,21 +18,31 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin" +COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on" case "${GOARCH}" in "amd64") COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" + # Static build for linking into the Go binary + init_vars + CMAKE_TARGETS="--target llama --target ggml" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="../build/darwin/${ARCH}_static" + echo "Building static library" + build + + # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # + init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" + BUILD_DIR="../build/darwin/${ARCH}/cpu" echo "Building LCD CPU" build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib - compress_libs + sign ${BUILD_DIR}/lib/libext_server.dylib + compress # # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance @@ -40,11 +50,11 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" + BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" echo "Building AVX CPU" build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib - compress_libs + sign ${BUILD_DIR}/lib/libext_server.dylib + compress # # ~2013 CPU Dynamic library @@ -52,20 +62,30 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" + BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib - compress_libs + sign ${BUILD_DIR}/lib/libext_server.dylib + compress ;; "arm64") + + # Static build for linking into the Go binary + init_vars + CMAKE_TARGETS="--target llama --target ggml" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="../build/darwin/${ARCH}_static" + echo "Building static library" + build + + init_vars CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" + BUILD_DIR="../build/darwin/${ARCH}/metal" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework 
MetalPerformanceShaders" build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib - compress_libs + sign ${BUILD_DIR}/lib/libext_server.dylib + compress ;; *) echo "GOARCH must be set" @@ -75,3 +95,4 @@ case "${GOARCH}" in esac cleanup +echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 67f1d6e6..941cb0be 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -57,16 +57,31 @@ init_vars git_module_setup apply_patches + +init_vars if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then + + if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then + # Static build for linking into the Go binary + init_vars + CMAKE_TARGETS="--target llama --target ggml" + CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="../build/linux/${ARCH}_static" + echo "Building static library" + build + fi + + # Users building from source can tune the exact flags we pass to cmake for configuring # llama.cpp, and we'll build only 1 CPU variant in that case as the default. if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then + init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" + BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build - compress_libs + compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer @@ -83,11 +98,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # + init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" + BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build - compress_libs + compress fi if [ "${ARCH}" == "x86_64" ]; then @@ -101,10 +117,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" + BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build - compress_libs + compress fi if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then @@ -114,10 +130,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" + BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build - compress_libs + compress fi fi fi @@ -157,7 +173,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off" fi CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" + BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt 
-lcuda" build @@ -165,20 +181,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then # # TODO - in the future we may shift to packaging these separately and conditionally # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )" + DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" for lib in libcudart.so libcublas.so libcublasLt.so ; do DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/" + cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/" + cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/" + cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/" + cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" fi done - compress_libs + compress fi @@ -201,23 +217,24 @@ if [ -d "${ROCM_PATH}" ]; then fi init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" + BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build # Record the ROCM dependencies - rm -f "${BUILD_DIR}/lib/deps.txt" - touch "${BUILD_DIR}/lib/deps.txt" - for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt" + rm -f "${BUILD_DIR}/bin/deps.txt" + touch "${BUILD_DIR}/bin/deps.txt" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do + echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" done # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/lib/deps.txt" + if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then + cat "${BUILD_DIR}/bin/deps.txt" echo "ERROR: deps file short" exit 1 fi - compress_libs + compress fi cleanup +echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 2ba0db89..f8f2997d 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -33,7 +33,7 @@ function init_vars { "-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off" ) - $script:cmakeTargets = @("ext_server") + $script:cmakeTargets = @("ollama_llama_server") $script:ARCH = "amd64" # arm not yet supported. 
if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -97,16 +97,14 @@ function apply_patches { } # Checkout each file - Set-Location -Path ${script:llamacppDir} foreach ($file in $filePaths) { - git checkout $file + git -C "${script:llamacppDir}" checkout $file } } # Apply each patch foreach ($patch in $patches) { - Set-Location -Path ${script:llamacppDir} - git apply $patch.FullName + git -C "${script:llamacppDir}" apply $patch.FullName } } @@ -115,41 +113,41 @@ function build { & cmake --version & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })" + write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} -} - -function install { - rm -ea 0 -recurse -force -path "${script:buildDir}/lib" - md "${script:buildDir}/lib" -ea 0 > $null - cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib" - cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib" - # Display the dll dependencies in the build log - if ($script:DUMPBIN -ne $null) { - & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" + # Rearrange output to be consistent between different generators + if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) { + mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/" + remove-item "${script:buildDir}/bin/${script:config}" } } function sign { if ("${env:KEY_CONTAINER}") { - write-host "Signing ${script:buildDir}/lib/*.dll" - foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){ - & "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` + write-host "Signing ${script:buildDir}/bin/*.exe ${script:buildDir}/bin/*.dll" + foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){ + & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } } -function compress_libs { +function compress { if ($script:GZIP -eq $null) { write-host "gzip not installed, not compressing files" return } + write-host "Compressing binaries..." + $binaries = dir "${script:buildDir}/bin/*.exe" + foreach ($file in $binaries) { + & "$script:GZIP" --best -f $file + } + write-host "Compressing dlls..." 
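+    # assumed helper: the foreach below reads $dlls, so populate it with the dll
+    # list here; otherwise the loop iterates nothing and the dlls stay uncompressed
+    $dlls = dir "${script:buildDir}/bin/*.dll"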
- $libs = dir "${script:buildDir}/lib/*.dll" - foreach ($file in $libs) { + $binaries = dir "${script:buildDir}/bin/*.dll" + foreach ($file in $dlls) { & "$script:GZIP" --best -f $file } } @@ -164,14 +162,11 @@ function cleanup { } # Checkout each file - Set-Location -Path ${script:llamacppDir} foreach ($file in $filePaths) { - git checkout $file + git -C "${script:llamacppDir}" checkout $file } + git -C "${script:llamacppDir}" checkout CMakeLists.txt } - Set-Location "${script:llamacppDir}/" - git checkout CMakeLists.txt - } init_vars @@ -179,7 +174,6 @@ git_module_setup apply_patches # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer -# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver @@ -187,32 +181,46 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) { +# GCC build for direct linking into the Go binary +init_vars +$script:cmakeTargets = @("llama", "ggml") +$script:cmakeDefs = @( + "-G", "MinGW Makefiles" + "-DBUILD_SHARED_LIBS=off", + "-DLLAMA_NATIVE=off", + "-DLLAMA_AVX=off", + "-DLLAMA_AVX2=off", + "-DLLAMA_AVX512=off", + "-DLLAMA_F16C=off", + "-DLLAMA_FMA=off") +$script:buildDir="../build/windows/${script:ARCH}_static" +write-host "Building static library" +build + +# remaining llama.cpp builds use MSVC init_vars $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" + $script:buildDir="../build/windows/${script:ARCH}/cpu" write-host "Building LCD CPU" build - install sign - compress_libs + compress init_vars $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" + $script:buildDir="../build/windows/${script:ARCH}/cpu_avx" write-host "Building AVX CPU" build - install sign - compress_libs + compress init_vars $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" + $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2" write-host "Building AVX2 CPU" build - install sign - compress_libs + compress } else { write-host "Skipping CPU generation step as requested" } @@ -225,13 +233,11 @@ if ($null -ne $script:CUDA_LIB_DIR) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } init_vars - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" + $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") - write-host "Building CUDA" build - install sign - compress_libs + compress } if ($null -ne $env:HIP_PATH) { @@ -241,7 +247,7 @@ if ($null -ne $env:HIP_PATH) { } init_vars - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" + 
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" $script:cmakeDefs += @( "-G", "Ninja", "-DCMAKE_C_COMPILER=clang.exe", @@ -264,13 +270,13 @@ if ($null -ne $env:HIP_PATH) { build # Ninja doesn't prefix with config name ${script:config}="" - install if ($null -ne $script:DUMPBIN) { - & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" + & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll" } sign - compress_libs + compress } + cleanup -write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})" +write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})" diff --git a/llm/generate/generate_darwin.go b/llm/generate/generate_darwin.go index 322879e9..77685234 100644 --- a/llm/generate/generate_darwin.go +++ b/llm/generate/generate_darwin.go @@ -1,3 +1,3 @@ package generate -//go:generate sh ./gen_darwin.sh +//go:generate bash ./gen_darwin.sh diff --git a/llm/llama.go b/llm/llama.go deleted file mode 100644 index cce9f484..00000000 --- a/llm/llama.go +++ /dev/null @@ -1,100 +0,0 @@ -package llm - -import ( - _ "embed" - "fmt" - "time" - - "github.com/ollama/ollama/api" -) - -const jsonGrammar = ` -root ::= object -value ::= object | array | string | number | ("true" | "false" | "null") ws - -object ::= - "{" ws ( - string ":" ws value - ("," ws string ":" ws value)* - )? "}" ws - -array ::= - "[" ws ( - value - ("," ws value)* - )? "]" ws - -string ::= - "\"" ( - [^"\\] | - "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes - )* "\"" ws - -number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws - -# Optional space: by convention, applied in this grammar after literal chars when allowed -ws ::= ([ \t\n] ws)? 
-` - -type ImageData struct { - Data []byte `json:"data"` - ID int `json:"id"` -} - -var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama") - -type prediction struct { - Content string `json:"content"` - Model string `json:"model"` - Prompt string `json:"prompt"` - Stop bool `json:"stop"` - - Timings struct { - PredictedN int `json:"predicted_n"` - PredictedMS float64 `json:"predicted_ms"` - PromptN int `json:"prompt_n"` - PromptMS float64 `json:"prompt_ms"` - } -} - -const maxRetries = 3 - -type PredictOpts struct { - Prompt string - Format string - Images []ImageData - Options api.Options -} - -type PredictResult struct { - Content string - Done bool - PromptEvalCount int - PromptEvalDuration time.Duration - EvalCount int - EvalDuration time.Duration -} - -type TokenizeRequest struct { - Content string `json:"content"` -} - -type TokenizeResponse struct { - Tokens []int `json:"tokens"` -} - -type DetokenizeRequest struct { - Tokens []int `json:"tokens"` -} - -type DetokenizeResponse struct { - Content string `json:"content"` -} - -type EmbeddingRequest struct { - Content string `json:"content"` -} - -type EmbeddingResponse struct { - Embedding []float64 `json:"embedding"` -} diff --git a/llm/llm.go b/llm/llm.go index c0d2c6d3..52c53ad2 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,183 +1,15 @@ package llm -import ( - "context" - "fmt" - "log/slog" - "os" - "slices" - "strings" +// #cgo CFLAGS: -Illama.cpp +// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++ +// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++ +// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++ +// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++ +// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++ +// #include "llama.h" +import "C" - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/format" - "github.com/ollama/ollama/gpu" -) - -type LLM interface { - Predict(context.Context, PredictOpts, func(PredictResult)) error - Embedding(context.Context, string) ([]float64, error) - Encode(context.Context, string) ([]int, error) - Decode(context.Context, []int) (string, error) - Close() -} - -var cpuOnlyFamilies = []string{ - "mamba", -} - -func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) { - if _, err := os.Stat(model); err != nil { - return nil, err - } - - f, err := os.Open(model) - if err != nil { - return nil, err - } - defer f.Close() - - ggml, _, err := DecodeGGML(f) - if err != nil { - return nil, err - } - - if opts.NumCtx > int(ggml.KV().ContextLength()) { - slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength()) - opts.NumCtx = int(ggml.KV().ContextLength()) - } - - if opts.NumCtx < 4 { - opts.NumCtx = 4 - } - - availableMemory, _ := gpu.CheckVRAM() - info := gpu.GetGPUInfo() - - usedMemory := info.MinimumMemory - for _, projector := range projectors { - usedMemory += projectorMemoryRequirements(projector) - - // multimodal models require at least 2048 context - opts.NumCtx = max(opts.NumCtx, 2048) - } - - // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv - kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV()) 
- - // this amount is the overhead + tensors in memory - // TODO: get this from the llama.cpp's graph calculations instead of - // estimating it's 1/6 * kv_cache_size * num_gqa - graph := int64(ggml.KV().GQA()) * kv / 6 - usedMemory += graph - - if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) { - info.Library = "cpu" - } - - requiredMemory := usedMemory - - var layers int - for i := 0; i < int(ggml.KV().BlockCount()); i++ { - layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount()) - requiredMemory += layerMemory - - if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) { - usedMemory += layerMemory - layers++ - } - } - - memOutputLayer := ggml.LayerSize("output.") - requiredMemory += memOutputLayer - - // only offload output layer if all repeating layers are offloaded - if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer { - usedMemory += memOutputLayer - layers++ - } - - slog.Info( - "offload to gpu", - "layers", layers, - "required", format.HumanBytes2(requiredMemory), - "used", format.HumanBytes2(usedMemory), - "available", format.HumanBytes2(availableMemory), - "kv", format.HumanBytes2(kv), - "graph", format.HumanBytes2(graph), - ) - - if opts.NumGPU < 0 && info.Library != "cpu" { - opts.NumGPU = layers - } - - return newLlmServer(info, model, adapters, projectors, opts) -} - -func projectorMemoryRequirements(filename string) int64 { - file, err := os.Open(filename) - if err != nil { - return 0 - } - defer file.Close() - - ggml, _, err := DecodeGGML(file) - if err != nil { - return 0 - } - - prefixes := make(map[string]struct{}) - for _, layer := range ggml.Tensors() { - parts := strings.Split(layer.Name, ".") - prefixes[strings.Join(parts[:2], ".")] = struct{}{} - } - - var ask int64 - for prefix := range prefixes { - ask += ggml.LayerSize(prefix) - } - - return ask -} - -// Give any native cgo implementations an opportunity to initialize -func Init() error { - return nativeInit() -} - -func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) { - dynLibs := getDynLibs(gpuInfo) - - // Check to see if the user has requested a specific library instead of auto-detecting - demandLib := os.Getenv("OLLAMA_LLM_LIBRARY") - if demandLib != "" { - libPath := availableDynLibs[demandLib] - if libPath == "" { - slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)) - } else { - slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)) - dynLibs = []string{libPath} - } - } - - // We stage into a temp directory, and if we've been idle for a while, it may have been reaped - _, err := os.Stat(dynLibs[0]) - if err != nil { - slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0])) - err = nativeInit() - if err != nil { - return nil, err - } - } - - err2 := fmt.Errorf("unable to locate suitable llm library") - for _, dynLib := range dynLibs { - srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts) - if err == nil { - return srv, nil - } - slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err)) - err2 = err - } - - return nil, err2 +// SystemInfo is an unused example of calling llama.cpp functions using CGo +func SystemInfo() string { + return C.GoString(C.llama_print_system_info()) } diff --git a/llm/payload_linux.go b/llm/llm_darwin_amd64.go similarity index 56% rename from llm/payload_linux.go rename to 
llm/llm_darwin_amd64.go index 276705c7..3093e1ad 100644 --- a/llm/payload_linux.go +++ b/llm/llm_darwin_amd64.go @@ -4,5 +4,5 @@ import ( "embed" ) -//go:embed llama.cpp/build/linux/*/*/lib/* +//go:embed build/darwin/x86_64/*/bin/* var libEmbed embed.FS diff --git a/llm/payload_windows.go b/llm/llm_darwin_arm64.go similarity index 52% rename from llm/payload_windows.go rename to llm/llm_darwin_arm64.go index d195745a..928f0b82 100644 --- a/llm/payload_windows.go +++ b/llm/llm_darwin_arm64.go @@ -4,5 +4,5 @@ import ( "embed" ) -//go:embed llama.cpp/build/windows/*/*/lib/*.dll* +//go:embed build/darwin/arm64/*/bin/* var libEmbed embed.FS diff --git a/llm/llm_linux.go b/llm/llm_linux.go new file mode 100644 index 00000000..c2c5c4cb --- /dev/null +++ b/llm/llm_linux.go @@ -0,0 +1,6 @@ +package llm + +import "embed" + +//go:embed build/linux/*/*/bin/* +var libEmbed embed.FS diff --git a/llm/llm_windows.go b/llm/llm_windows.go new file mode 100644 index 00000000..17967b4e --- /dev/null +++ b/llm/llm_windows.go @@ -0,0 +1,6 @@ +package llm + +import "embed" + +//go:embed build/windows/*/*/bin/* +var libEmbed embed.FS diff --git a/llm/payload.go b/llm/payload.go new file mode 100644 index 00000000..8a134357 --- /dev/null +++ b/llm/payload.go @@ -0,0 +1,211 @@ +package llm + +import ( + "compress/gzip" + "errors" + "fmt" + "io" + "io/fs" + "log/slog" + "os" + "path/filepath" + "strings" + + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" + + "github.com/ollama/ollama/gpu" +) + +var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama") + +func Init() error { + payloadsDir, err := gpu.PayloadsDir() + if err != nil { + return err + } + + slog.Info("extracting embedded files", "dir", payloadsDir) + binGlob := "build/*/*/*/bin/*" + + // extract server libraries + err = extractFiles(payloadsDir, binGlob) + if err != nil { + return fmt.Errorf("extract binaries: %v", err) + } + + var variants []string + for v := range availableServers() { + variants = append(variants, v) + } + slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) + slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") + + return nil +} + +// binary names may contain an optional variant separated by '_' +// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2" +// Any library without a variant is the lowest common denominator +func availableServers() map[string]string { + payloadsDir, err := gpu.PayloadsDir() + if err != nil { + slog.Error("payload lookup error", "error", err) + return nil + } + + // glob payloadsDir for files that start with ollama_ + pattern := filepath.Join(payloadsDir, "*") + + files, err := filepath.Glob(pattern) + if err != nil { + slog.Debug("could not glob", "pattern", pattern, "error", err) + return nil + } + + servers := make(map[string]string) + for _, file := range files { + slog.Debug("availableServers : found", "file", file) + servers[filepath.Base(file)] = file + } + + return servers +} + +// serversForGpu returns a list of compatible servers give the provided GPU +// info, ordered by performance. 
assumes Init() has been called +// TODO - switch to metadata based mapping +func serversForGpu(info gpu.GpuInfo) []string { + // glob workDir for files that start with ollama_ + availableServers := availableServers() + requested := info.Library + if info.Variant != "" { + requested += "_" + info.Variant + } + + servers := []string{} + + // exact match first + for a := range availableServers { + if a == requested { + servers = []string{a} + + if a == "metal" { + return servers + } + + break + } + } + + alt := []string{} + + // Then for GPUs load alternates and sort the list for consistent load ordering + if info.Library != "cpu" { + for a := range availableServers { + if info.Library == strings.Split(a, "_")[0] && a != requested { + alt = append(alt, a) + } + } + + slices.Sort(alt) + servers = append(servers, alt...) + } + + // Load up the best CPU variant if not primary requested + if info.Library != "cpu" { + variant := gpu.GetCPUVariant() + // If no variant, then we fall back to default + // If we have a variant, try that if we find an exact match + // Attempting to run the wrong CPU instructions will panic the + // process + if variant != "" { + for cmp := range availableServers { + if cmp == "cpu_"+variant { + servers = append(servers, cmp) + break + } + } + } else { + servers = append(servers, "cpu") + } + } + + if len(servers) == 0 { + servers = []string{"cpu"} + } + + return servers +} + +// extract extracts the embedded files to the target directory +func extractFiles(targetDir string, glob string) error { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return errPayloadMissing + } + + if err := os.MkdirAll(targetDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err) + } + + g := new(errgroup.Group) + + // build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE + for _, file := range files { + filename := file + + variant := filepath.Base(filepath.Dir(filepath.Dir(filename))) + + slog.Debug("extracting", "variant", variant, "file", filename) + + g.Go(func() error { + srcf, err := libEmbed.Open(filename) + if err != nil { + return err + } + defer srcf.Close() + + src := io.Reader(srcf) + if strings.HasSuffix(filename, ".gz") { + src, err = gzip.NewReader(src) + if err != nil { + return fmt.Errorf("decompress payload %s: %v", filename, err) + } + filename = strings.TrimSuffix(filename, ".gz") + } + + variantDir := filepath.Join(targetDir, variant) + if err := os.MkdirAll(variantDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err) + } + + base := filepath.Base(filename) + destFilename := filepath.Join(variantDir, base) + + _, err = os.Stat(destFilename) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", filename, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, src); err != nil { + return fmt.Errorf("copy payload %s: %v", filename, err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", filename, err) + } + return nil + }) + } + + err = g.Wait() + if err != nil { + // If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted + gpu.Cleanup() + return err + } + return nil +} diff --git a/llm/payload_common.go b/llm/payload_common.go deleted file mode 100644 index 45e45a2d..00000000 --- a/llm/payload_common.go +++ /dev/null @@ -1,233 +0,0 @@ -package llm - -import ( - 
"compress/gzip" - "errors" - "fmt" - "io" - "io/fs" - "log/slog" - "os" - "path/filepath" - "runtime" - "strings" - "sync" - - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" - - "github.com/ollama/ollama/gpu" -) - -// Libraries names may contain an optional variant separated by '_' -// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2" -// Any library without a variant is the lowest common denominator -var availableDynLibs = map[string]string{} - -const pathComponentCount = 7 - -// getDynLibs returns an ordered list of LLM libraries to try, starting with the best -func getDynLibs(gpuInfo gpu.GpuInfo) []string { - // Short circuit if we know we're using the default built-in (darwin only) - if gpuInfo.Library == "default" { - return []string{"default"} - } - // TODO - temporary until we have multiple CPU variations for Darwin - // Short circuit on darwin with metal only - if len(availableDynLibs) == 1 { - if _, onlyMetal := availableDynLibs["metal"]; onlyMetal { - return []string{availableDynLibs["metal"]} - } - } - - exactMatch := "" - dynLibs := []string{} - altDynLibs := []string{} - requested := gpuInfo.Library - if gpuInfo.Variant != "" { - requested += "_" + gpuInfo.Variant - } - // Try to find an exact match - for cmp := range availableDynLibs { - if requested == cmp { - exactMatch = cmp - dynLibs = []string{availableDynLibs[cmp]} - break - } - } - // Then for GPUs load alternates and sort the list for consistent load ordering - if gpuInfo.Library != "cpu" { - for cmp := range availableDynLibs { - if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch { - altDynLibs = append(altDynLibs, cmp) - } - } - slices.Sort(altDynLibs) - for _, altDynLib := range altDynLibs { - dynLibs = append(dynLibs, availableDynLibs[altDynLib]) - } - } - - // Load up the best CPU variant if not primary requested - if gpuInfo.Library != "cpu" { - variant := gpu.GetCPUVariant() - // If no variant, then we fall back to default - // If we have a variant, try that if we find an exact match - // Attempting to run the wrong CPU instructions will panic the - // process - if variant != "" { - for cmp := range availableDynLibs { - if cmp == "cpu_"+variant { - dynLibs = append(dynLibs, availableDynLibs[cmp]) - break - } - } - } else { - dynLibs = append(dynLibs, availableDynLibs["cpu"]) - } - } - - // Finally, if we didn't find any matches, LCD CPU FTW - if len(dynLibs) == 0 { - dynLibs = []string{availableDynLibs["cpu"]} - } - slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs)) - return dynLibs -} - -func rocmDynLibPresent() bool { - for dynLibName := range availableDynLibs { - if strings.HasPrefix(dynLibName, "rocm") { - return true - } - } - return false -} - -func nativeInit() error { - payloadsDir, err := gpu.PayloadsDir() - if err != nil { - return err - } - - slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir)) - - libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*") - if err != nil { - if errors.Is(err, payloadMissing) { - slog.Info(fmt.Sprintf("%s", payloadMissing)) - return nil - } - return err - } - for _, lib := range libs { - // The last dir component is the variant name - variant := filepath.Base(filepath.Dir(lib)) - availableDynLibs[variant] = lib - } - - if err := verifyDriverAccess(); err != nil { - return err - } - - // Report which dynamic libraries we have loaded to assist troubleshooting - variants := make([]string, len(availableDynLibs)) - i := 0 - for variant := range availableDynLibs { - 
variants[i] = variant - i++ - } - slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) - slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") - - return nil -} - -func extractDynamicLibs(payloadsDir, glob string) ([]string, error) { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return nil, payloadMissing - } - - var mu sync.Mutex - var libs []string - var g errgroup.Group - for _, file := range files { - pathComps := strings.Split(file, "/") - if len(pathComps) != pathComponentCount { - slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps)) - continue - } - - file := file - g.Go(func() error { - // llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY - // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3]) - srcFile, err := libEmbed.Open(file) - if err != nil { - return fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err) - } - src := io.Reader(srcFile) - filename := file - if strings.HasSuffix(file, ".gz") { - src, err = gzip.NewReader(src) - if err != nil { - return fmt.Errorf("decompress payload %s: %v", file, err) - } - filename = strings.TrimSuffix(filename, ".gz") - } - - destFile := filepath.Join(targetDir, filepath.Base(filename)) - if strings.Contains(destFile, "server") { - mu.Lock() - libs = append(libs, destFile) - mu.Unlock() - } - - destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", file, err) - } - defer destFp.Close() - if _, err := io.Copy(destFp, src); err != nil { - return fmt.Errorf("copy payload %s: %v", file, err) - } - return nil - }) - } - err = g.Wait() - if err != nil { - // If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted - gpu.Cleanup() - return nil, err - } - return libs, nil -} - -func verifyDriverAccess() error { - if runtime.GOOS != "linux" { - return nil - } - // Only check ROCm access if we have the dynamic lib loaded - if rocmDynLibPresent() { - // Verify we have permissions - either running as root, or we have group access to the driver - fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) - if err != nil { - if errors.Is(err, fs.ErrPermission) { - return fmt.Errorf("Radeon card detected, but permissions not set up properly. 
Either run ollama as root, or add you user account to the render group.") - } else if errors.Is(err, fs.ErrNotExist) { - // expected behavior without a radeon card - return nil - } - - return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) - } - fd.Close() - } - return nil -} diff --git a/llm/payload_darwin_amd64.go b/llm/payload_darwin_amd64.go deleted file mode 100644 index dfeeb9cf..00000000 --- a/llm/payload_darwin_amd64.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib* -var libEmbed embed.FS diff --git a/llm/payload_darwin_arm64.go b/llm/payload_darwin_arm64.go deleted file mode 100644 index aa70c931..00000000 --- a/llm/payload_darwin_arm64.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib* -var libEmbed embed.FS diff --git a/llm/payload_test.go b/llm/payload_test.go deleted file mode 100644 index eb88d812..00000000 --- a/llm/payload_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package llm - -import ( - "testing" - - "github.com/ollama/ollama/gpu" - "github.com/stretchr/testify/assert" -) - -func TestGetDynLibs(t *testing.T) { - availableDynLibs = map[string]string{ - "cpu": "X_cpu", - } - assert.Equal(t, false, rocmDynLibPresent()) - res := getDynLibs(gpu.GpuInfo{Library: "cpu"}) - assert.Len(t, res, 1) - assert.Equal(t, availableDynLibs["cpu"], res[0]) - - variant := gpu.GetCPUVariant() - if variant != "" { - variant = "_" + variant - } - availableDynLibs = map[string]string{ - "rocm_v5": "X_rocm_v5", - "rocm_v6": "X_rocm_v6", - "cpu" + variant: "X_cpu", - } - assert.Equal(t, true, rocmDynLibPresent()) - res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) - assert.Len(t, res, 3) - assert.Equal(t, availableDynLibs["rocm_v5"], res[0]) - assert.Equal(t, availableDynLibs["rocm_v6"], res[1]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) - - res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 3) - assert.Equal(t, availableDynLibs["rocm_v6"], res[0]) - assert.Equal(t, availableDynLibs["rocm_v5"], res[1]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) - - res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) - assert.Len(t, res, 1) - assert.Equal(t, availableDynLibs["cpu"+variant], res[0]) - - res = getDynLibs(gpu.GpuInfo{Library: "default"}) - assert.Len(t, res, 1) - assert.Equal(t, "default", res[0]) - - availableDynLibs = map[string]string{ - "rocm": "X_rocm_v5", - "cpu" + variant: "X_cpu", - } - assert.Equal(t, true, rocmDynLibPresent()) - res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 2) - assert.Equal(t, availableDynLibs["rocm"], res[0]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[1]) -} diff --git a/llm/server.go b/llm/server.go new file mode 100644 index 00000000..d395cc7f --- /dev/null +++ b/llm/server.go @@ -0,0 +1,854 @@ +package llm + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "log/slog" + "math/rand" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "slices" + "strconv" + "strings" + "time" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/format" + "github.com/ollama/ollama/gpu" +) + +// LlamaServer is an instance of the llama.cpp server +type LlamaServer struct { + port int + cmd *exec.Cmd + done chan error // Channel to signal when the process exits + status *StatusWriter + options *api.Options +} + +var 
cpuOnlyFamilies = []string{ + "mamba", +} + +func NewLlamaServer(model string, adapters, projectors []string, opts *api.Options) (*LlamaServer, error) { + if _, err := os.Stat(model); err != nil { + return nil, err + } + + f, err := os.Open(model) + if err != nil { + return nil, err + } + defer f.Close() + + ggml, _, err := DecodeGGML(f) + if err != nil { + return nil, err + } + + if opts.NumCtx > int(ggml.KV().ContextLength()) { + slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength()) + opts.NumCtx = int(ggml.KV().ContextLength()) + } + + if opts.NumCtx < 4 { + opts.NumCtx = 4 + } + + availableMemory, _ := gpu.CheckVRAM() + info := gpu.GetGPUInfo() + + usedMemory := info.MinimumMemory + for _, projector := range projectors { + usedMemory += projectorMemoryRequirements(projector) + + // multimodal models require at least 2048 context + opts.NumCtx = max(opts.NumCtx, 2048) + } + + // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv + kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV()) + + // this amount is the overhead + tensors in memory + // TODO: get this from the llama.cpp's graph calculations instead of + // estimating it's 1/6 * kv_cache_size * num_gqa + graph := int64(ggml.KV().GQA()) * kv / 6 + usedMemory += graph + + if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) { + info.Library = "cpu" + } + + requiredMemory := usedMemory + + var layers int + for i := 0; i < int(ggml.KV().BlockCount()); i++ { + layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount()) + requiredMemory += layerMemory + + if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) { + usedMemory += layerMemory + layers++ + } + } + + memOutputLayer := ggml.LayerSize("output.") + requiredMemory += memOutputLayer + + // only offload output layer if all repeating layers are offloaded + if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer { + usedMemory += memOutputLayer + layers++ + } + + slog.Info( + "offload to gpu", + "layers", layers, + "required", format.HumanBytes2(requiredMemory), + "used", format.HumanBytes2(usedMemory), + "available", format.HumanBytes2(availableMemory), + "kv", format.HumanBytes2(kv), + "graph", format.HumanBytes2(graph), + ) + + if opts.NumGPU < 0 && info.Library != "cpu" { + opts.NumGPU = layers + } + + if len(adapters) > 1 { + return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") + } + + availableServers := availableServers() + servers := serversForGpu(info) + + demandLib := os.Getenv("OLLAMA_LLM_LIBRARY") + if demandLib != "" { + serverPath := availableServers[demandLib] + if serverPath == "" { + slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)) + } else { + slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath) + servers = []string{demandLib} + } + } + + if len(servers) == 0 { + return nil, fmt.Errorf("no servers found for %v", info) + } + + params := []string{ + "--model", model, + "--ctx-size", fmt.Sprintf("%d", opts.NumCtx), + "--batch-size", fmt.Sprintf("%d", opts.NumBatch), + "--embedding", + } + if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { + params = append(params, "--log-format", "json") + } else { + 
params = append(params, "--log-disable") + } + + if opts.NumGPU > 0 { + params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU)) + } + + if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" { + params = append(params, "--verbose") + } + + if opts.MainGPU > 0 { + params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU)) + } + + if opts.RopeFrequencyBase > 0 { + params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase)) + } + + if opts.RopeFrequencyScale > 0 { + params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale)) + } + + if len(adapters) > 0 { + // TODO: applying multiple adapters is not supported by the llama.cpp server yet + params = append(params, "--lora", adapters[0]) + } + + if len(projectors) > 0 { + // TODO: applying multiple projectors is not supported by the llama.cpp server yet + params = append(params, "--mmproj", projectors[0]) + } + + if opts.NumThread > 0 { + params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread)) + } + + if !opts.F16KV { + params = append(params, "--memory-f32") + } + + if opts.UseMLock { + params = append(params, "--mlock") + } + + if !opts.UseMMap { + params = append(params, "--no-mmap") + } + + if opts.UseNUMA { + params = append(params, "--numa") + } + + // Loop through potential servers + var finalErr error + for i := 0; i < len(servers); i++ { + dir := availableServers[servers[i]] + + // Find an availableServers port, retry on each iterration in case the failure was a port conflict race + port := 0 + if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { + var l *net.TCPListener + if l, err = net.ListenTCP("tcp", a); err == nil { + port = l.Addr().(*net.TCPAddr).Port + l.Close() + } + } + if port == 0 { + slog.Debug("ResolveTCPAddr failed ", "error", err) + port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range + } + finalParams := append(params, "--port", strconv.Itoa(port)) + + pathEnv := "LD_LIBRARY_PATH" + if runtime.GOOS == "windows" { + pathEnv = "PATH" + } + // append the server directory to LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} + if libraryPath, ok := os.LookupEnv(pathEnv); ok { + // Append our runner directory to the path + // This will favor system libraries over our bundled library dependencies + libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...) 
+ } + + server := filepath.Join(dir, "ollama_llama_server") + if runtime.GOOS == "windows" { + server = server + ".exe" + } + + s := &LlamaServer{ + port: port, + cmd: exec.Command(server, finalParams...), + status: NewStatusWriter(os.Stderr), + options: opts, + } + libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator))) + slog.Debug(libEnv) + s.cmd.Env = append(os.Environ(), libEnv) + s.cmd.Stdout = os.Stdout + s.cmd.Stderr = s.status + + slog.Info("starting llama server", "cmd", s.cmd.String()) + + if err = s.cmd.Start(); err != nil { + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + err = fmt.Errorf("error starting the external llama server: %v %s", err, msg) + finalErr = err + continue + } + + // reap subprocess when it exits + go func() { + // Exit status managed via getServerStatus + _ = s.cmd.Wait() + }() + + if err = s.waitUntilRunning(); err != nil { + slog.Error("error starting llama server", "server", servers[i], "error", err) + s.Close() + finalErr = err + continue + } + return s, nil + } + + slog.Error("unable to load any llama server", "error", finalErr) + return nil, finalErr +} + +func projectorMemoryRequirements(filename string) int64 { + file, err := os.Open(filename) + if err != nil { + return 0 + } + defer file.Close() + + ggml, _, err := DecodeGGML(file) + if err != nil { + return 0 + } + + prefixes := make(map[string]struct{}) + for _, layer := range ggml.Tensors() { + parts := strings.Split(layer.Name, ".") + prefixes[strings.Join(parts[:2], ".")] = struct{}{} + } + + var ask int64 + for prefix := range prefixes { + ask += ggml.LayerSize(prefix) + } + + return ask +} + +type ServerStatus int + +const ( // iota is reset to 0 + ServerStatusReady ServerStatus = iota + ServerStatusNoSlotsAvaialble + ServerStatusLoadingModel + ServerStatusNotResponding + ServerStatusError +) + +type ServerStatusResp struct { + Status string `json:"status"` + SlotsIdle int `json:"slots_idle"` + SlotsProcessing int `json:"slots_processing"` + Error string `json:"error"` +} + +func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) { + // Fail fast if its exited + if s.cmd.ProcessState != nil { + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil) + if err != nil { + return ServerStatusError, fmt.Errorf("error creating GET request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return ServerStatusNotResponding, fmt.Errorf("server not responding") + } + return ServerStatusError, fmt.Errorf("health resp: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return ServerStatusError, fmt.Errorf("read health request: %w", err) + } + + var status ServerStatusResp + if err := json.Unmarshal(body, &status); err != nil { + return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err) + } + + switch status.Status { + case "ok": + return ServerStatusReady, nil + case "no slot available": + return ServerStatusNoSlotsAvaialble, nil + case "loading model": + return ServerStatusLoadingModel, nil + default: + 
return ServerStatusError, fmt.Errorf("server error: %+v", status) + } +} + +func (s *LlamaServer) Ping(ctx context.Context) error { + _, err := s.getServerStatus(ctx) + if err != nil { + slog.Debug("server unhealthy", "error", err) + return err + } + return nil +} + +func (s *LlamaServer) waitUntilRunning() error { + start := time.Now() + expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load + ticker := time.NewTicker(50 * time.Millisecond) + defer ticker.Stop() + + slog.Info("waiting for llama runner to start responding") + var lastStatus ServerStatus = -1 + for { + select { + case err := <-s.done: + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + case <-ticker.C: + if time.Now().After(expiresAt) { + // timeout + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + return fmt.Errorf("timed out waiting for llama runner to start: %s", msg) + } + if s.cmd.ProcessState != nil { + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg) + } + + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + status, err := s.getServerStatus(ctx) + if err != nil && lastStatus != status { + slog.Debug("server not yet available", "error", err) + lastStatus = status + continue + } + + switch status { + case ServerStatusLoadingModel: + // TODO - this state never seems to happen with the current server.cpp code (bug?) + // it doesn't respond to the health endpoint until after the model is loaded + slog.Debug("loading model") + case ServerStatusReady: + slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds())) + return nil + } + } + } +} + +const jsonGrammar = ` +root ::= object +value ::= object | array | string | number | ("true" | "false" | "null") ws + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws + +string ::= + "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes + )* "\"" ws + +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +# Optional space: by convention, applied in this grammar after literal chars when allowed +ws ::= ([ \t\n] ws)? 
+` + +const maxBufferSize = 512 * format.KiloByte +const maxRetries = 3 + +type ImageData struct { + Data []byte `json:"data"` + ID int `json:"id"` +} + +type completion struct { + Content string `json:"content"` + Model string `json:"model"` + Prompt string `json:"prompt"` + Stop bool `json:"stop"` + + Timings struct { + PredictedN int `json:"predicted_n"` + PredictedMS float64 `json:"predicted_ms"` + PromptN int `json:"prompt_n"` + PromptMS float64 `json:"prompt_ms"` + } +} + +type CompletionRequest struct { + Prompt string + Format string + Images []ImageData + Options api.Options +} + +type CompletionResponse struct { + Content string + Done bool + PromptEvalCount int + PromptEvalDuration time.Duration + EvalCount int + EvalDuration time.Duration +} + +func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { + request := map[string]any{ + "prompt": req.Prompt, + "stream": true, + "n_predict": req.Options.NumPredict, + "n_keep": req.Options.NumKeep, + "main_gpu": req.Options.MainGPU, + "temperature": req.Options.Temperature, + "top_k": req.Options.TopK, + "top_p": req.Options.TopP, + "tfs_z": req.Options.TFSZ, + "typical_p": req.Options.TypicalP, + "repeat_last_n": req.Options.RepeatLastN, + "repeat_penalty": req.Options.RepeatPenalty, + "presence_penalty": req.Options.PresencePenalty, + "frequency_penalty": req.Options.FrequencyPenalty, + "mirostat": req.Options.Mirostat, + "mirostat_tau": req.Options.MirostatTau, + "mirostat_eta": req.Options.MirostatEta, + "penalize_nl": req.Options.PenalizeNewline, + "seed": req.Options.Seed, + "stop": req.Options.Stop, + "image_data": req.Images, + "cache_prompt": true, + } + + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return err + } else if status != ServerStatusReady { + return fmt.Errorf("unexpected server status: %d", status) + } + + if req.Format == "json" { + request["grammar"] = jsonGrammar + if !strings.Contains(strings.ToLower(req.Prompt), "json") { + slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.") + } + } + + retryDelay := 100 * time.Microsecond + for retries := 0; retries < maxRetries; retries++ { + if retries > 0 { + time.Sleep(retryDelay) // wait before retrying + retryDelay *= 2 // exponential backoff + } + + // Handling JSON marshaling with special characters unescaped. 
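+		// (note: a plain json.Marshal would escape <, >, and & to \u003c-style
+		// sequences; the encoder below with SetEscapeHTML(false) keeps the
+		// prompt bytes as-is for the runner)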
+ buffer := &bytes.Buffer{} + enc := json.NewEncoder(buffer) + enc.SetEscapeHTML(false) + + if err := enc.Encode(request); err != nil { + return fmt.Errorf("failed to marshal data: %v", err) + } + + endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer) + if err != nil { + return fmt.Errorf("error creating POST request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("POST predict: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed reading llm error response: %w", err) + } + log.Printf("llm predict error: %s", bodyBytes) + return fmt.Errorf("%s", bodyBytes) + } + + scanner := bufio.NewScanner(resp.Body) + buf := make([]byte, 0, maxBufferSize) + scanner.Buffer(buf, maxBufferSize) + + retryNeeded := false + // keep track of the last token generated, this is used to abort if the model starts looping + var lastToken string + var tokenRepeat int + + for scanner.Scan() { + select { + case <-ctx.Done(): + // This handles the request cancellation + return ctx.Err() + default: + line := scanner.Bytes() + if len(line) == 0 { + continue + } + + // try again on slot unavailable + if bytes.Contains(line, []byte("slot unavailable")) { + retryNeeded = true + break + } + + evt, ok := bytes.CutPrefix(line, []byte("data: ")) + if !ok { + return fmt.Errorf("error parsing llm response stream: %s", line) + } + + var c completion + if err := json.Unmarshal(evt, &c); err != nil { + return fmt.Errorf("error unmarshaling llm prediction response: %v", err) + } + + switch { + case strings.TrimSpace(c.Content) == lastToken: + tokenRepeat++ + default: + lastToken = strings.TrimSpace(c.Content) + tokenRepeat = 0 + } + + // 30 picked as an arbitrary max token repeat limit, modify as needed + if tokenRepeat > 30 { + slog.Debug("prediction aborted, token repeat limit reached") + return ctx.Err() + } + + if c.Content != "" { + fn(CompletionResponse{ + Content: c.Content, + }) + } + + if c.Stop { + fn(CompletionResponse{ + Done: true, + PromptEvalCount: c.Timings.PromptN, + PromptEvalDuration: parseDurationMs(c.Timings.PromptMS), + EvalCount: c.Timings.PredictedN, + EvalDuration: parseDurationMs(c.Timings.PredictedMS), + }) + return nil + } + } + } + + if err := scanner.Err(); err != nil { + if strings.Contains(err.Error(), "unexpected EOF") { + s.Close() + msg := "" + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + + return fmt.Errorf("an unknown error was encountered while running the model %s", msg) + } + return fmt.Errorf("error reading llm response: %v", err) + } + + if !retryNeeded { + return nil // success + } + } + + // should never reach here ideally + return fmt.Errorf("max retries exceeded") +} + +type EmbeddingRequest struct { + Content string `json:"content"` +} + +type EmbeddingResponse struct { + Embedding []float64 `json:"embedding"` +} + +func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) { + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return nil, err + } else if status != ServerStatusReady { + return nil, fmt.Errorf("unexpected server status: %d", status) + } + + data, err := json.Marshal(TokenizeRequest{Content: prompt}) + if err != nil { + return nil, fmt.Errorf("error marshaling embed 
data: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) + if err != nil { + return nil, fmt.Errorf("error creating embed request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("do embedding request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading embed response: %w", err) + } + + if resp.StatusCode >= 400 { + log.Printf("llm encode error: %s", body) + return nil, fmt.Errorf("%s", body) + } + + var embedding EmbeddingResponse + if err := json.Unmarshal(body, &embedding); err != nil { + return nil, fmt.Errorf("unmarshal tokenize response: %w", err) + } + + return embedding.Embedding, nil +} + +type TokenizeRequest struct { + Content string `json:"content"` +} + +type TokenizeResponse struct { + Tokens []int `json:"tokens"` +} + +func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) { + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return nil, err + } else if status != ServerStatusReady { + return nil, fmt.Errorf("unexpected server status: %d", status) + } + + data, err := json.Marshal(TokenizeRequest{Content: content}) + if err != nil { + return nil, fmt.Errorf("marshaling encode data: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data)) + if err != nil { + return nil, fmt.Errorf("encode request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("do encode request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read encode request: %w", err) + } + + if resp.StatusCode >= 400 { + log.Printf("llm encode error: %s", body) + return nil, fmt.Errorf("%s", body) + } + + var encoded TokenizeResponse + if err := json.Unmarshal(body, &encoded); err != nil { + return nil, fmt.Errorf("unmarshal encode response: %w", err) + } + + return encoded.Tokens, nil +} + +type DetokenizeRequest struct { + Tokens []int `json:"tokens"` +} + +type DetokenizeResponse struct { + Content string `json:"content"` +} + +func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) { + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return "", err + } else if status != ServerStatusReady { + return "", fmt.Errorf("unexpected server status: %d", status) + } + + data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) + if err != nil { + return "", fmt.Errorf("marshaling decode data: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data)) + if err != nil { + return "", fmt.Errorf("decode request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", fmt.Errorf("do decode request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("read decode request: %w", err) + } + + if resp.StatusCode >= 400 { + log.Printf("llm decode error: %s", body) + return "", 
fmt.Errorf("%s", body) + } + + var decoded DetokenizeResponse + if err := json.Unmarshal(body, &decoded); err != nil { + return "", fmt.Errorf("unmarshal encode response: %w", err) + } + + return decoded.Content, nil +} + +func (s *LlamaServer) Close() error { + if s.cmd != nil { + slog.Debug("stopping llama server") + return s.cmd.Process.Kill() + } + + return nil +} + +func parseDurationMs(ms float64) time.Duration { + dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) + if err != nil { + panic(err) + } + + return dur +} diff --git a/llm/status.go b/llm/status.go new file mode 100644 index 00000000..8a49bd55 --- /dev/null +++ b/llm/status.go @@ -0,0 +1,42 @@ +package llm + +import ( + "bytes" + "os" +) + +// StatusWriter is a writer that captures error messages from the llama runner process +type StatusWriter struct { + LastErrMsg string + out *os.File +} + +func NewStatusWriter(out *os.File) *StatusWriter { + return &StatusWriter{ + out: out, + } +} + +// TODO - regex matching to detect errors like +// libcublasLt.so.11: cannot open shared object file: No such file or directory + +var errorPrefixes = []string{ + "error:", + "CUDA error", + "cudaMalloc failed", + "\"ERR\"", +} + +func (w *StatusWriter) Write(b []byte) (int, error) { + var errMsg string + for _, prefix := range errorPrefixes { + if _, after, ok := bytes.Cut(b, []byte(prefix)); ok { + errMsg = prefix + string(bytes.TrimSpace(after)) + } + } + if errMsg != "" { + w.LastErrMsg = errMsg + } + + return w.out.Write(b) +} diff --git a/llm/utils.go b/llm/utils.go deleted file mode 100644 index 4dc03c80..00000000 --- a/llm/utils.go +++ /dev/null @@ -1,15 +0,0 @@ -package llm - -import ( - "fmt" - "time" -) - -func parseDurationMs(ms float64) time.Duration { - dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) - if err != nil { - panic(err) - } - - return dur -} diff --git a/server/routes.go b/server/routes.go index 62fa86c9..d5b61d6f 100644 --- a/server/routes.go +++ b/server/routes.go @@ -56,12 +56,13 @@ func init() { var loaded struct { mu sync.Mutex - runner llm.LLM + llama *llm.LlamaServer - expireAt time.Time expireTimer *time.Timer - *Model + model string + adapters []string + projectors []string *api.Options } @@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error { - needLoad := loaded.runner == nil || // is there a model loaded? - loaded.ModelPath != model.ModelPath || // has the base model changed? - !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? - !reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed? + ctx, cancel := context.WithTimeout(c, 10*time.Second) + defer cancel() + + needLoad := loaded.llama == nil || // is there a model loaded? + loaded.model != model.ModelPath || // has the base model changed? + !reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed? + !reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed? + !reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed? 
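+		// has the running server become unresponsive?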
+ loaded.llama.Ping(ctx) != nil if needLoad { - if loaded.runner != nil { + if loaded.llama != nil { slog.Info("changing loaded model") - loaded.runner.Close() - loaded.runner = nil - loaded.Model = nil + loaded.llama.Close() + loaded.llama = nil + loaded.model = "" + loaded.adapters = nil + loaded.projectors = nil loaded.Options = nil } - llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts) + llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to @@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time. return err } - loaded.Model = model - loaded.runner = llmRunner + loaded.model = model.ModelPath + loaded.adapters = model.AdapterPaths + loaded.projectors = model.ProjectorPaths + loaded.llama = llama loaded.Options = opts } - loaded.expireAt = time.Now().Add(sessionDuration) - if loaded.expireTimer == nil { loaded.expireTimer = time.AfterFunc(sessionDuration, func() { loaded.mu.Lock() defer loaded.mu.Unlock() - if time.Now().Before(loaded.expireAt) { - return + if loaded.llama != nil { + loaded.llama.Close() } - if loaded.runner != nil { - loaded.runner.Close() - } - - loaded.runner = nil - loaded.Model = nil + loaded.llama = nil + loaded.model = "" + loaded.adapters = nil + loaded.projectors = nil loaded.Options = nil }) } @@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) { sb.Reset() if req.Context != nil { - prev, err := loaded.runner.Decode(c.Request.Context(), req.Context) + prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -286,9 +292,8 @@ func GenerateHandler(c *gin.Context) { go func() { defer close(ch) - fn := func(r llm.PredictResult) { + fn := func(r llm.CompletionResponse) { // Update model expiration - loaded.expireAt = time.Now().Add(sessionDuration) loaded.expireTimer.Reset(sessionDuration) // Build up the full response @@ -322,7 +327,7 @@ func GenerateHandler(c *gin.Context) { } // TODO (jmorganca): encode() should not strip special tokens - tokens, err := loaded.runner.Encode(c.Request.Context(), p) + tokens, err := loaded.llama.Tokenize(c.Request.Context(), p) if err != nil { ch <- gin.H{"error": err.Error()} return @@ -344,13 +349,13 @@ func GenerateHandler(c *gin.Context) { } // Start prediction - predictReq := llm.PredictOpts{ + req := llm.CompletionRequest{ Prompt: prompt, Format: req.Format, Images: images, Options: opts, } - if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil { + if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil { ch <- gin.H{"error": err.Error()} } }() @@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) { return } - embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt) + embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt) if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) @@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error { signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM) go func() { <-signals - if loaded.runner != nil { - loaded.runner.Close() + if loaded.llama != nil { + loaded.llama.Close() } gpu.Cleanup() 
os.Exit(0) @@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) { // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) { encode := func(s string) ([]int, error) { - return loaded.runner.Encode(ctx, s) + return loaded.llama.Tokenize(ctx, s) } prompt, err := ChatPrompt(template, messages, numCtx, encode) @@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) { go func() { defer close(ch) - fn := func(r llm.PredictResult) { + fn := func(r llm.CompletionResponse) { // Update model expiration - loaded.expireAt = time.Now().Add(sessionDuration) loaded.expireTimer.Reset(sessionDuration) resp := api.ChatResponse{ @@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) { ch <- resp } - // Start prediction - predictReq := llm.PredictOpts{ + if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Format: req.Format, Images: images, Options: opts, - } - if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil { + }, fn); err != nil { ch <- gin.H{"error": err.Error()} } }() diff --git a/server/routes_test.go b/server/routes_test.go index c853e672..7a9afe1e 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -17,7 +17,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/version" ) @@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) { }, } - s := Server{} + s := &Server{} router := s.GenerateRoutes() httpSrv := httptest.NewServer(router) @@ -242,27 +241,3 @@ func Test_Routes(t *testing.T) { } } - -type MockLLM struct { - encoding []int -} - -func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error { - return nil -} - -func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) { - return llm.encoding, nil -} - -func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) { - return "", nil -} - -func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) { - return []float64{}, nil -} - -func (llm *MockLLM) Close() { - // do nothing -} From 0a0e9f3e0fa30e49c330cc48932c703d2a4d1e7a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 19 Mar 2024 09:49:24 +0100 Subject: [PATCH 2/7] Apply 01-cache.diff --- llm/ext_server/server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 5df5bb47..f5d7863d 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1007,13 +1007,15 @@ struct llama_server_context slot.n_sent_text += result.text_to_send.size(); // add the token to slot queue and cache } - slot.add_token_string(result); + if (slot.params.stream) { send_partial_response(slot, result); } } + slot.add_token_string(result); + if (incomplete) { slot.has_next_token = true; From 4fec5816d6e6b79b91fc0f61ce1927faafc1017a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 27 Mar 2024 11:02:06 -0700 Subject: [PATCH 3/7] Integration test improvements Cleaner shutdown logic, a bit of response hardening --- integration/basic_test.go | 2 +- integration/utils_test.go | 51 +++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/integration/basic_test.go b/integration/basic_test.go index 926ca52c..40bde03c 
100644 --- a/integration/basic_test.go +++ b/integration/basic_test.go @@ -24,5 +24,5 @@ func TestOrcaMiniBlueSky(t *testing.T) { "seed": 123, }, } - GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"}) + GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"}) } diff --git a/integration/utils_test.go b/integration/utils_test.go index 14ec39dc..0f712271 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -126,7 +126,7 @@ func StartServer(ctx context.Context, ollamaHost string) error { } func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error { - slog.Debug("checking status of model", "model", modelName) + slog.Info("checking status of model", "model", modelName) showReq := &api.ShowRequest{Name: modelName} requestJSON, err := json.Marshal(showReq) if err != nil { @@ -174,36 +174,51 @@ func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoin return nil } +var serverProcMutex sync.Mutex + func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) { + + // TODO maybe stuff in an init routine? + lifecycle.InitLogging() + requestJSON, err := json.Marshal(genReq) if err != nil { t.Fatalf("Error serializing request: %v", err) } defer func() { - if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" { - // TODO - fp, err := os.Open(lifecycle.ServerLogFile) - if err != nil { - slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err) - return + if os.Getenv("OLLAMA_TEST_EXISTING") == "" { + defer serverProcMutex.Unlock() + if t.Failed() { + fp, err := os.Open(lifecycle.ServerLogFile) + if err != nil { + slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err) + return + } + data, err := io.ReadAll(fp) + if err != nil { + slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err) + return + } + slog.Warn("SERVER LOG FOLLOWS") + os.Stderr.Write(data) + slog.Warn("END OF SERVER") } - data, err := io.ReadAll(fp) - if err != nil { - slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err) - return + err = os.Remove(lifecycle.ServerLogFile) + if err != nil && !os.IsNotExist(err) { + slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err) } - slog.Warn("SERVER LOG FOLLOWS") - os.Stderr.Write(data) - slog.Warn("END OF SERVER") - } - err = os.Remove(lifecycle.ServerLogFile) - if err != nil && !os.IsNotExist(err) { - slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err) } }() scheme, testEndpoint := GetTestEndpoint() if os.Getenv("OLLAMA_TEST_EXISTING") == "" { + serverProcMutex.Lock() + fp, err := os.CreateTemp("", "ollama-server-*.log") + if err != nil { + t.Fatalf("failed to generate log file: %s", err) + } + lifecycle.ServerLogFile = fp.Name() + fp.Close() assert.NoError(t, StartServer(ctx, testEndpoint)) } From 10ed1b6292d552ecb3a062d7ca334f2dd2f75e17 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 28 Mar 2024 09:27:17 -0700 Subject: [PATCH 4/7] Detect too-old cuda driver "cudart init failure: 35" isn't particularly helpful in the logs. 
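
The hunk below maps cudart return code 35 (CUDA_ERROR_INSUFFICIENT_DRIVER) to an actionable message before it reaches the logs. A minimal Go sketch of the same mapping, assuming an illustrative cudartInitError helper that is not part of this patch:

```go
package main

import (
	"errors"
	"fmt"
)

// Value taken from the cudartReturn_t enum added in the hunk below.
const cudaErrorInsufficientDriver = 35

// cudartInitError is illustrative only; it mirrors the C-side mapping of
// return code 35 to an actionable message instead of a bare error number.
func cudartInitError(code int) error {
	if code == cudaErrorInsufficientDriver {
		return errors.New("your nvidia driver is too old or missing, please upgrade to run ollama")
	}
	return fmt.Errorf("cudart init failure: %d", code)
}

func main() {
	fmt.Println(cudartInitError(35)) // friendly, actionable message
	fmt.Println(cudartInitError(3))  // any other code falls back to the raw number
}
```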
--- gpu/gpu_info_cudart.c | 4 ++++ gpu/gpu_info_cudart.h | 1 + 2 files changed, 5 insertions(+) diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c index 9f69f845..ef13f5c0 100644 --- a/gpu/gpu_info_cudart.c +++ b/gpu/gpu_info_cudart.c @@ -62,6 +62,10 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret); UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; + if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { + resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama"); + return; + } snprintf(buf, buflen, "cudart init failure: %d", ret); resp->err = strdup(buf); return; diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h index 476e7555..492704a8 100644 --- a/gpu/gpu_info_cudart.h +++ b/gpu/gpu_info_cudart.h @@ -7,6 +7,7 @@ typedef enum cudartReturn_enum { CUDART_SUCCESS = 0, CUDART_UNSUPPORTED = 1, + CUDA_ERROR_INSUFFICIENT_DRIVER = 35, // Other values omitted for now... } cudartReturn_t; From 0a74cb31d5cc7f3ea51c29db3db726b2816411b5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 28 Mar 2024 14:26:17 -0700 Subject: [PATCH 5/7] Safeguard for noexec We may have users that run into problems with our current payload model, so this gives us an escape valve. --- docs/troubleshooting.md | 7 +++++++ gpu/assets.go | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 7103be4d..b9038e38 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -76,3 +76,10 @@ install script which version to install. ```sh curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh ``` + +## Linux tmp noexec + +If your system is configured with the "noexec" flag where Ollama stores its +temporary executable files, you can specify an alternate location by setting +OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example +OLLAMA_TMPDIR=/usr/share/ollama/ \ No newline at end of file diff --git a/gpu/assets.go b/gpu/assets.go index 539635ee..085c05bc 100644 --- a/gpu/assets.go +++ b/gpu/assets.go @@ -22,11 +22,20 @@ var ( func PayloadsDir() (string, error) { lock.Lock() defer lock.Unlock() + var err error if payloadsDir == "" { cleanupTmpDirs() - tmpDir, err := os.MkdirTemp("", "ollama") - if err != nil { - return "", fmt.Errorf("failed to generate tmp dir: %w", err) + tmpDir := os.Getenv("OLLAMA_TMPDIR") + if tmpDir == "" { + tmpDir, err = os.MkdirTemp("", "ollama") + if err != nil { + return "", fmt.Errorf("failed to generate tmp dir: %w", err) + } + } else { + err = os.MkdirAll(tmpDir, 0755) + if err != nil { + return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err) + } } // Track our pid so we can clean up orphaned tmpdirs From 526d4eb2044158f5a25511e6800d8701e2ff2489 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 30 Mar 2024 15:34:21 -0700 Subject: [PATCH 6/7] Release gpu discovery library after use Leaving the cudart library loaded kept ~30m of memory pinned in the GPU in the main process. This change ensures we don't hold GPU resources when idle. 
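
The gpu.go hunk below replaces the long-lived package-level handles with a per-call acquire and deferred release. A simplified Go sketch of that shape, with stand-in types in place of the real cgo-backed cudart/nvml wrappers:

```go
package main

import "fmt"

// handles stands in for the cgo-backed cudart/nvml wrappers; only the
// acquire/defer-release shape is meaningful here.
type handles struct {
	cudart bool
	nvml   bool
}

func initGPUHandles() *handles {
	// Discovery would try cudart first, then nvml (details elided).
	return &handles{cudart: true}
}

func releaseGPUHandles(h *handles) {
	// Unload whichever library was loaded so no GPU memory stays pinned
	// in the main process between discovery calls.
	if h.cudart {
		fmt.Println("releasing cudart library")
	}
	if h.nvml {
		fmt.Println("releasing nvml library")
	}
}

func GetGPUInfo() {
	gpuHandles := initGPUHandles()
	defer releaseGPUHandles(gpuHandles) // held only for the duration of this call
	// ... query VRAM and compute capability here ...
}

func main() {
	GetGPUInfo()
}
```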
--- gpu/gpu.go | 26 ++++++++++++++++---------- gpu/gpu_info_cudart.c | 6 ++++++ gpu/gpu_info_cudart.h | 1 + gpu/gpu_info_nvml.c | 7 +++++++ gpu/gpu_info_nvml.h | 1 + 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index dec3f95e..cf2f3b7f 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -35,7 +35,6 @@ const ( ) var gpuMutex sync.Mutex -var gpuHandles *handles = nil // With our current CUDA compile flags, older than 5.0 will not work properly var CudaComputeMin = [2]C.int{5, 0} @@ -85,11 +84,11 @@ var CudartWindowsGlobs = []string{ var CudaTegra string = os.Getenv("JETSON_JETPACK") // Note: gpuMutex must already be held -func initGPUHandles() { +func initGPUHandles() *handles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing - gpuHandles = &handles{nil, nil} + gpuHandles := &handles{nil, nil} var nvmlMgmtName string var nvmlMgmtPatterns []string var cudartMgmtName string @@ -116,7 +115,7 @@ func initGPUHandles() { } cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...) default: - return + return gpuHandles } slog.Info("Detecting GPU type") @@ -126,7 +125,7 @@ func initGPUHandles() { if cudart != nil { slog.Info("Nvidia GPU detected via cudart") gpuHandles.cudart = cudart - return + return gpuHandles } } @@ -137,10 +136,10 @@ func initGPUHandles() { if nvml != nil { slog.Info("Nvidia GPU detected via nvidia-ml") gpuHandles.nvml = nvml - return + return gpuHandles } } - + return gpuHandles } func GetGPUInfo() GpuInfo { @@ -148,9 +147,16 @@ func GetGPUInfo() GpuInfo { // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries gpuMutex.Lock() defer gpuMutex.Unlock() - if gpuHandles == nil { - initGPUHandles() - } + + gpuHandles := initGPUHandles() + defer func() { + if gpuHandles.nvml != nil { + C.nvml_release(*gpuHandles.nvml) + } + if gpuHandles.cudart != nil { + C.cudart_release(*gpuHandles.cudart) + } + }() // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX cpuVariant := GetCPUVariant() diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c index ef13f5c0..27cd2342 100644 --- a/gpu/gpu_info_cudart.c +++ b/gpu/gpu_info_cudart.c @@ -191,4 +191,10 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r } } +void cudart_release(cudart_handle_t h) { + LOG(h.verbose, "releasing cudart library\n"); + UNLOAD_LIBRARY(h.handle); + h.handle = NULL; +} + #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h index 492704a8..eb9336ec 100644 --- a/gpu/gpu_info_cudart.h +++ b/gpu/gpu_info_cudart.h @@ -55,6 +55,7 @@ typedef struct cudart_compute_capability { void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp); void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp); void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc); +void cudart_release(cudart_handle_t ch); #endif // __GPU_INFO_CUDART_H__ #endif // __APPLE__ diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c index aacf0410..67c80b0f 100644 --- a/gpu/gpu_info_nvml.c +++ b/gpu/gpu_info_nvml.c @@ -211,4 +211,11 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) { } } } + +void nvml_release(nvml_handle_t h) { + LOG(h.verbose, "releasing nvml library\n"); + UNLOAD_LIBRARY(h.handle); + h.handle = NULL; +} + #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_nvml.h b/gpu/gpu_info_nvml.h index 
819e41fd..bd1d6001 100644 --- a/gpu/gpu_info_nvml.h +++ b/gpu/gpu_info_nvml.h @@ -51,6 +51,7 @@ typedef struct nvml_compute_capability { void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp); void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp); void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc); +void nvml_release(nvml_handle_t ch); #endif // __GPU_INFO_NVML_H__ #endif // __APPLE__ \ No newline at end of file From 1f11b525111ea89bf325ad6fda3e3a13d6396a50 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 1 Apr 2024 16:47:33 -0700 Subject: [PATCH 7/7] Refined min memory from testing --- gpu/gpu.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index cf2f3b7f..708fad10 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -30,8 +30,8 @@ type handles struct { } const ( - cudaMinimumMemory = 377 * format.MebiByte - rocmMinimumMemory = 377 * format.MebiByte + cudaMinimumMemory = 457 * format.MebiByte + rocmMinimumMemory = 457 * format.MebiByte ) var gpuMutex sync.Mutex
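
The raised floor above bounds how little free VRAM is acceptable before the CUDA or ROCm runner is worth starting. A minimal sketch of how such a floor is typically consulted during GPU selection, assuming illustrative gpuInfo and pickLibrary names that are not part of this patch:

```go
package main

import "fmt"

const mebiByte = 1024 * 1024

// Floor refined by testing in this patch; the same value applies to ROCm.
const cudaMinimumMemory = 457 * mebiByte

// gpuInfo and pickLibrary are illustrative stand-ins, not part of the diff.
type gpuInfo struct {
	Library    string
	FreeMemory uint64
}

func pickLibrary(g gpuInfo) string {
	if g.Library == "cuda" && g.FreeMemory < cudaMinimumMemory {
		// Not enough headroom for the CUDA runner; fall back to CPU.
		return "cpu"
	}
	return g.Library
}

func main() {
	fmt.Println(pickLibrary(gpuInfo{Library: "cuda", FreeMemory: 256 * mebiByte}))      // cpu
	fmt.Println(pickLibrary(gpuInfo{Library: "cuda", FreeMemory: 8 * 1024 * mebiByte})) // cuda
}
```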