From 811b1f03c8270baa8004aff9e977ed3d18593661 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Fri, 24 Nov 2023 13:58:09 -0500 Subject: [PATCH 01/19] deprecate ggml - remove ggml runner - automatically pull gguf models when ggml detected - tell users to update to gguf in the case automatic pull fails Co-Authored-By: Jeffrey Morgan --- .dockerignore | 1 - .gitmodules | 5 -- cmd/cmd.go | 24 ++++- docs/modelfile.md | 2 +- llm/ggml.go | 78 +--------------- llm/llama.cpp/generate_darwin_amd64.go | 9 -- llm/llama.cpp/generate_darwin_arm64.go | 9 -- llm/llama.cpp/generate_linux.go | 12 --- llm/llama.cpp/generate_windows.go | 7 -- llm/llama.cpp/ggml | 1 - .../0001-add-detokenize-endpoint.patch | 51 ----------- .../patches/0002-34B-model-support.patch | 89 ------------------- ...onization-in-new-matrix-multiplicati.patch | 30 ------- ...dd-missing-barriers-for-mul-mat-2699.patch | 41 --------- ...DA-s-half-type-for-aarch64-1455-2670.patch | 32 ------- llm/llama.go | 23 +++-- llm/llm.go | 16 +--- server/images.go | 35 +++++++- server/routes.go | 2 +- 19 files changed, 74 insertions(+), 393 deletions(-) delete mode 160000 llm/llama.cpp/ggml delete mode 100644 llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch delete mode 100644 llm/llama.cpp/patches/0002-34B-model-support.patch delete mode 100644 llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch delete mode 100644 llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch delete mode 100644 llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch diff --git a/.dockerignore b/.dockerignore index 150c8f6e..116c58f7 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,7 +3,6 @@ ollama app dist scripts -llm/llama.cpp/ggml llm/llama.cpp/gguf .env .cache diff --git a/.gitmodules b/.gitmodules index 49a54fa9..5a65e4df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,3 @@ -[submodule "llm/llama.cpp/ggml"] - path = llm/llama.cpp/ggml - url = https://github.com/ggerganov/llama.cpp.git - ignore = dirty - shallow = true [submodule "llm/llama.cpp/gguf"] path = llm/llama.cpp/gguf url = https://github.com/ggerganov/llama.cpp.git diff --git a/cmd/cmd.go b/cmd/cmd.go index b0ba416b..23d9c120 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error { } if err := client.Generate(ctx, &request, fn); err != nil { - if errors.Is(err, context.Canceled) { + switch { + case errors.Is(err, context.Canceled): return nil + case strings.Contains(err.Error(), "unsupported model format"): + // pull and retry to see if the model has been updated + parts := strings.Split(opts.Model, string(os.PathSeparator)) + if len(parts) == 1 { + // this is a library model, log some info + fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. 
Pulling a new version...") + } + if err := PullHandler(cmd, []string{opts.Model}); err != nil { + fmt.Printf("Error: %s\n", err) + return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error + } + // retry + if err := client.Generate(ctx, &request, fn); err != nil { + if errors.Is(err, context.Canceled) { + return nil + } + return err + } + default: + return err } - return err } if opts.Prompt != "" { fmt.Println() diff --git a/docs/modelfile.md b/docs/modelfile.md index 20113090..80e896eb 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -188,7 +188,7 @@ SYSTEM """""" ### ADAPTER -The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined. +The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGUF file format. The adapter should be tuned from the base model otherwise the behaviour is undefined. ```modelfile ADAPTER ./ollama-lora.bin diff --git a/llm/ggml.go b/llm/ggml.go index 8d421e03..f71328e1 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -86,74 +86,6 @@ type container interface { Decode(*readSeekOffset) (model, error) } -type containerGGML struct{} - -func (c *containerGGML) Name() string { - return "ggml" -} - -func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) { - // file contents aren't decoded - ro.Seek(0, io.SeekEnd) - return nil, nil -} - -type containerGGMF struct { - version uint32 -} - -func (c *containerGGMF) Name() string { - return "ggmf" -} - -func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) { - var version uint32 - binary.Read(ro, binary.LittleEndian, &version) - - switch version { - case 1: - default: - return nil, errors.New("invalid version") - } - - c.version = version - - // remaining file contents aren't decoded - ro.Seek(0, io.SeekEnd) - - return nil, nil -} - -type containerGGJT struct { - version uint32 -} - -func (c *containerGGJT) Name() string { - return "ggjt" -} - -func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) { - var version uint32 - binary.Read(ro, binary.LittleEndian, &version) - - switch version { - case 1, 2, 3: - default: - return nil, errors.New("invalid version") - } - - c.version = version - - // different model types may have different layouts for hyperparameters - var llama llamaModel - binary.Read(ro, binary.LittleEndian, &llama.hyperparameters) - - // remaining file contents aren't decoded - ro.Seek(0, io.SeekEnd) - - return &llama, nil -} - type containerLORA struct { version uint32 } @@ -194,6 +126,8 @@ const ( FILE_MAGIC_GGUF_BE = 0x47475546 ) +var ErrUnsupportedFormat = errors.New("unsupported model format") + func DecodeGGML(r io.ReadSeeker) (*GGML, error) { ro := readSeekOffset{ReadSeeker: r} @@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) { var c container switch magic { - case FILE_MAGIC_GGML: - c = &containerGGML{} - case FILE_MAGIC_GGMF: - c = &containerGGMF{} - case FILE_MAGIC_GGJT: - c = &containerGGJT{} + case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: + return nil, ErrUnsupportedFormat case FILE_MAGIC_GGLA: c = &containerLORA{} case FILE_MAGIC_GGUF_LE: diff --git a/llm/llama.cpp/generate_darwin_amd64.go 
b/llm/llama.cpp/generate_darwin_amd64.go index 65a53386..fed45fd9 100644 --- a/llm/llama.cpp/generate_darwin_amd64.go +++ b/llm/llama.cpp/generate_darwin_amd64.go @@ -2,15 +2,6 @@ package llm //go:generate git submodule init -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch -//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch -//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -//go:generate cmake --build ggml/build/cpu --target server --config Release -//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner - //go:generate git submodule update --force gguf //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on diff --git a/llm/llama.cpp/generate_darwin_arm64.go b/llm/llama.cpp/generate_darwin_arm64.go index 81fd8914..0c33bc51 100644 --- a/llm/llama.cpp/generate_darwin_arm64.go +++ b/llm/llama.cpp/generate_darwin_arm64.go @@ -2,15 +2,6 @@ package llm //go:generate git submodule init -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch -//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch -//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -//go:generate cmake --build ggml/build/metal --target server --config Release -//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner - //go:generate git submodule update --force gguf //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch //go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index ce9e78a5..e67ca21a 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -2,15 +2,6 @@ package llm //go:generate git submodule init -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch -//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate cmake -S ggml -B ggml/build/cpu 
-DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cpu --target server --config Release -//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner - //go:generate git submodule update --force gguf //go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch @@ -18,9 +9,6 @@ package llm //go:generate cmake --build gguf/build/cpu --target server --config Release //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner -//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cuda --target server --config Release -//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner //go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0 //go:generate cmake --build gguf/build/cuda --target server --config Release //go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner diff --git a/llm/llama.cpp/generate_windows.go b/llm/llama.cpp/generate_windows.go index 2fb4c39f..6cd9566f 100644 --- a/llm/llama.cpp/generate_windows.go +++ b/llm/llama.cpp/generate_windows.go @@ -2,13 +2,6 @@ package llm //go:generate git submodule init -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cpu --target server --config Release -//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe - //go:generate git submodule update --force gguf //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off diff --git a/llm/llama.cpp/ggml b/llm/llama.cpp/ggml deleted file mode 160000 index 9e232f02..00000000 --- a/llm/llama.cpp/ggml +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b diff --git a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch b/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch deleted file mode 100644 index 34ea0e2d..00000000 --- a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001 -From: Bruce MacDonald -Date: Mon, 28 Aug 2023 18:08:12 -0400 -Subject: [PATCH] add detokenize endpoint - ---- - examples/server/server.cpp | 21 +++++++++++++++++++++ - 1 file changed, 21 insertions(+) - -diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 9966045..5014691 100644 ---- a/examples/server/server.cpp -+++ b/examples/server/server.cpp -@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector &tokens) - {"tokens", tokens}}; - } - -+static json format_detokenized_response(std::string content) -+{ -+ return json{ -+ {"content", content}}; -+} -+ - static void parse_options_completion(const json &body, llama_server_context &llama) - { - gpt_params default_params; -@@ -1361,6 +1367,21 @@ 
int main(int argc, char **argv) - const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json"); }); - -+ svr.Post("/detokenize", [&llama](const Request &req, Response &res) -+ { -+ auto lock = llama.lock(); -+ -+ const json body = json::parse(req.body); -+ std::string content; -+ if (body.count("tokens") != 0) -+ { -+ const std::vector tokens = body["tokens"]; -+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); -+ } -+ -+ const json data = format_detokenized_response(content); -+ return res.set_content(data.dump(), "application/json"); }); -+ - svr.Post("/embedding", [&llama](const Request &req, Response &res) - { - auto lock = llama.lock(); --- -2.39.2 (Apple Git-143) - diff --git a/llm/llama.cpp/patches/0002-34B-model-support.patch b/llm/llama.cpp/patches/0002-34B-model-support.patch deleted file mode 100644 index 275a6cbb..00000000 --- a/llm/llama.cpp/patches/0002-34B-model-support.patch +++ /dev/null @@ -1,89 +0,0 @@ -From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001 -From: Bruce MacDonald -Date: Mon, 28 Aug 2023 18:08:53 -0400 -Subject: [PATCH] 34B model support - ---- - llama.cpp | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/llama.cpp b/llama.cpp -index f2cbe76..62c5cdf 100644 ---- a/llama.cpp -+++ b/llama.cpp -@@ -79,6 +79,7 @@ enum e_model { - MODEL_7B, - MODEL_13B, - MODEL_30B, -+ MODEL_34B, - MODEL_65B, - MODEL_70B, - }; -@@ -122,6 +123,7 @@ static std::map MEM_REQ_SCRATCH0(int n_ctx) - { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB }, -+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess - { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB }, - }; -@@ -135,6 +137,7 @@ static const std::map & MEM_REQ_SCRATCH1() - { MODEL_7B, 160ull * MB }, - { MODEL_13B, 192ull * MB }, - { MODEL_30B, 256ull * MB }, -+ { MODEL_34B, 256ull * MB }, - { MODEL_65B, 384ull * MB }, // guess - { MODEL_70B, 304ull * MB }, - }; -@@ -149,6 +152,7 @@ static const std::map & MEM_REQ_EVAL() - { MODEL_7B, 10ull * MB }, - { MODEL_13B, 12ull * MB }, - { MODEL_30B, 16ull * MB }, -+ { MODEL_34B, 16ull * MB }, - { MODEL_65B, 24ull * MB }, // guess - { MODEL_70B, 24ull * MB }, - }; -@@ -164,6 +168,7 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() - { MODEL_7B, 512ull * kB }, - { MODEL_13B, 640ull * kB }, - { MODEL_30B, 768ull * kB }, -+ { MODEL_34B, 768ull * kB }, - { MODEL_65B, 1280ull * kB }, - { MODEL_70B, 1280ull * kB }, - }; -@@ -179,6 +184,7 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() - { MODEL_7B, 128ull }, - { MODEL_13B, 160ull }, - { MODEL_30B, 208ull }, -+ { MODEL_34B, 208ull }, - { MODEL_65B, 256ull }, - { MODEL_70B, 256ull }, - }; -@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) { - case MODEL_7B: return "7B"; - case MODEL_13B: return "13B"; - case MODEL_30B: return "30B"; -+ case MODEL_34B: return "34B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - default: LLAMA_ASSERT(false); -@@ -1074,6 +1081,7 @@ static void llama_model_load_internal( - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; -+ case 48: model.type = e_model::MODEL_34B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - 
default: -@@ -1094,6 +1102,8 @@ static void llama_model_load_internal( - LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model -+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) { -+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model - } - - hparams.rope_freq_base = rope_freq_base; --- -2.39.2 (Apple Git-143) - diff --git a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch b/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch deleted file mode 100644 index e5540ab1..00000000 --- a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +++ /dev/null @@ -1,30 +0,0 @@ -From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001 -From: Shouzheng Liu -Date: Mon, 21 Aug 2023 06:59:29 -0400 -Subject: [PATCH] metal : fix synchronization in new matrix multiplication - kernel (#2686) - ---- - ggml-metal.metal | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/ggml-metal.metal b/ggml-metal.metal -index 3f31252..88d48f6 100644 ---- a/ggml-metal.metal -+++ b/ggml-metal.metal -@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0, - threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { -+ threadgroup_barrier(mem_flags::mem_device); - simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); - } - -- threadgroup_barrier(mem_flags::mem_threadgroup); -+ threadgroup_barrier(mem_flags::mem_device); - device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; - if (sgitg==0) { - for (int i = 0; i < n_rows; i++) { --- -2.41.0 - diff --git a/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch deleted file mode 100644 index a2649097..00000000 --- a/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001 -From: Shouzheng Liu -Date: Tue, 22 Aug 2023 02:18:40 -0400 -Subject: [PATCH] metal : add missing barriers for mul-mat (#2699) - ---- - ggml-metal.metal | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/ggml-metal.metal b/ggml-metal.metal -index 88d48f6..ce3541f 100644 ---- a/ggml-metal.metal -+++ b/ggml-metal.metal -@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0, - //load data and store to threadgroup memory - half4x4 temp_a; - dequantize_func(x, il, temp_a); -+ threadgroup_barrier(mem_flags::mem_threadgroup); - #pragma unroll(16) - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ -@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0, - } - } else { - // block is smaller than 64x32, we should avoid writing data outside of the matrix -+ threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { -- threadgroup_barrier(mem_flags::mem_device); - simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); - } - -- 
threadgroup_barrier(mem_flags::mem_device); -+ threadgroup_barrier(mem_flags::mem_threadgroup); - device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; - if (sgitg==0) { - for (int i = 0; i < n_rows; i++) { --- -2.41.0 - diff --git a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch b/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch deleted file mode 100644 index 7b67f680..00000000 --- a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001 -From: Kylin <56434533+KyL0N@users.noreply.github.com> -Date: Tue, 22 Aug 2023 15:14:23 +0800 -Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670) - -* ggml: support CUDA's half type for aarch64(#1455) -support CUDA's half type for aarch64 in ggml_fp16_t definition - -* ggml: use __CUDACC__ to recognise nvcc compiler ---- - ggml.h | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/ggml.h b/ggml.h -index 544ad2d..0ec7ec5 100644 ---- a/ggml.h -+++ b/ggml.h -@@ -259,8 +259,9 @@ - extern "C" { - #endif - --#ifdef __ARM_NEON -- // we use the built-in 16-bit float type -+#if defined(__ARM_NEON) && defined(__CUDACC__) -+ typedef half ggml_fp16_t; -+#elif defined(__ARM_NEON) - typedef __fp16 ggml_fp16_t; - #else - typedef uint16_t ggml_fp16_t; --- -2.39.2 (Apple Git-143) - diff --git a/llm/llama.go b/llm/llama.go index adaa4c57..574c24ff 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -59,13 +59,12 @@ ws ::= ([ \t\n] ws)? var llamaCppEmbed embed.FS type ModelRunner struct { - Type string // "gguf" or "ggml" Path string // path to the model runner executable Accelerated bool } -func chooseRunners(workDir, runnerType string) []ModelRunner { - buildPath := path.Join("llama.cpp", runnerType, "build") +func chooseRunners(workDir string) []ModelRunner { + buildPath := path.Join("llama.cpp", "gguf", "build") var runners []ModelRunner // set the runners based on the OS @@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner { switch runtime.GOOS { case "darwin": if runtime.GOARCH == "arm64" { - runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}} + runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}} } else { - runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}} + runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}} } case "linux": runners = []ModelRunner{ - {Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true}, - {Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, + {Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true}, + {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, } case "windows": // TODO: select windows GPU runner here when available runners = []ModelRunner{ - {Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true}, - {Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")}, + {Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true}, + {Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")}, } default: log.Printf("unknown OS, running on 
CPU: %s", runtime.GOOS) runners = []ModelRunner{ - {Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, + {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, } } @@ -141,7 +140,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner { } } if !runnerAvailable { - log.Fatalf("%s runner not found", runnerType) + log.Fatalf("gguf runner not found") } // return the runners to try in priority order @@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner { for _, r := range runners { // clean the ModelRunner paths so that they match the OS we are running on localRunnersByPriority = append(localRunnersByPriority, ModelRunner{ - Type: r.Type, Path: filepath.Clean(path.Join(workDir, r.Path)), Accelerated: r.Accelerated, }) @@ -350,6 +348,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner "--batch-size", fmt.Sprintf("%d", opts.NumBatch), "--n-gpu-layers", fmt.Sprintf("%d", numGPU), "--embedding", + "--parallel", "2", } if opts.MainGPU > 0 { diff --git a/llm/llm.go b/llm/llm.go index fc6bbfd7..92bffd76 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -76,16 +76,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) } } - switch ggml.Name() { - case "gguf": - // TODO: gguf will load these options automatically from the model binary - opts.NumGQA = 0 - opts.RopeFrequencyBase = 0.0 - opts.RopeFrequencyScale = 0.0 - return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts) - case "ggml", "ggmf", "ggjt", "ggla": - return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts) - default: - return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily()) - } + opts.NumGQA = 0 + opts.RopeFrequencyBase = 0.0 + opts.RopeFrequencyScale = 0.0 + return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts) } diff --git a/server/images.go b/server/images.go index 9442bd74..8e7b3343 100644 --- a/server/images.go +++ b/server/images.go @@ -418,6 +418,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars return err } + // if the model is not in gguf format, pull the base model to try and get it in gguf format + if fromConfig.ModelFormat != "gguf" { + fn(api.ProgressResponse{Status: "updating base model"}) + if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil { + log.Printf("error pulling model: %v", err) + } + // Reset the file pointer to the beginning of the file + _, err = fromConfigFile.Seek(0, 0) + if err != nil { + return fmt.Errorf("update from config after pull: %w", err) + } + if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil { + return err + } + } + + // if the model is still not in gguf format, error out + if fromConfig.ModelFormat != "gguf" { + return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args) + } + config.SetModelFormat(fromConfig.ModelFormat) config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...) 
config.SetModelType(fromConfig.ModelType) @@ -456,15 +477,21 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars defer bin.Close() var offset int64 + CREATE: for { fn(api.ProgressResponse{Status: "creating model layer"}) bin.Seek(offset, io.SeekStart) ggml, err := llm.DecodeGGML(bin) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return err + if err != nil { + switch { + case errors.Is(err, io.EOF): + break CREATE + case errors.Is(err, llm.ErrUnsupportedFormat): + return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err) + default: + return err + } } config.SetModelFormat(ggml.Name()) diff --git a/server/routes.go b/server/routes.go index b9bd5447..12213606 100644 --- a/server/routes.go +++ b/server/routes.go @@ -114,7 +114,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to // check for model compatibility - if strings.Contains(err.Error(), "failed to load model") { + if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") { err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName) } From 5e7fd6906f4653fa671aa5d2e2d4dd5bdf17fd36 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Mon, 11 Dec 2023 15:35:31 -0500 Subject: [PATCH 02/19] Update images.go --- server/images.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/server/images.go b/server/images.go index 8e7b3343..006a91d3 100644 --- a/server/images.go +++ b/server/images.go @@ -421,7 +421,11 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars // if the model is not in gguf format, pull the base model to try and get it in gguf format if fromConfig.ModelFormat != "gguf" { fn(api.ProgressResponse{Status: "updating base model"}) - if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil { + parent, err := GetModel(c.Args) + if err != nil { + return err + } + if err := PullModel(ctx, parent.OriginalModel, &RegistryOptions{}, fn); err != nil { log.Printf("error pulling model: %v", err) } // Reset the file pointer to the beginning of the file From d4cd6957598ba6a3a1bb4e2660ee24b82e2541da Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 17:20:34 -0800 Subject: [PATCH 03/19] Add cgo implementation for llama.cpp Run the server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions. 
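[Annotation, not part of the patch] For readers unfamiliar with the approach described above, the sketch below is a minimal, self-contained illustration of the cgo pattern this commit relies on: C code compiled into the Go binary and invoked through an extern "C"-style surface, with C strings converted to Go values. It is not the Ollama API; the server_status function here is a hypothetical stand-in for the llama_server_* entry points the patch adds.

    package main

    /*
    // Hypothetical C entry point standing in for the patched server API.
    static const char* server_status(void) { return "ok"; }
    */
    import "C"

    import "fmt"

    func main() {
        // Call into C from Go and convert the returned C string to a Go string.
        fmt.Println("status:", C.GoString(C.server_status()))
    }

The real implementation in llm/ext_server.go follows the same shape, but links against the static libraries built from the gguf submodule and frees any C-allocated strings it receives.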
--- .dockerignore | 2 +- .gitignore | 3 +- go.mod | 2 +- go.sum | 3 +- llm/ext_server.go | 325 +++++++++ llm/gpu_cuda.go | 57 ++ llm/gpu_darwin.go | 19 + llm/llama.cpp/gen_common.sh | 34 + llm/llama.cpp/gen_darwin.sh | 36 + llm/llama.cpp/gen_linux.sh | 17 + llm/llama.cpp/gen_windows.ps1 | 51 ++ llm/llama.cpp/generate_darwin.go | 3 + llm/llama.cpp/generate_darwin_amd64.go | 9 - llm/llama.cpp/generate_darwin_arm64.go | 9 - llm/llama.cpp/generate_linux.go | 13 +- llm/llama.cpp/generate_windows.go | 16 +- .../0001-Expose-callable-API-for-server.patch | 422 +++++++++++ .../0001-copy-cuda-runtime-libraries.patch | 27 - .../0001-update-default-log-target.patch | 25 - llm/llama.go | 656 ------------------ llm/llm.go | 3 +- scripts/build_darwin.sh | 2 +- scripts/build_linux.sh | 2 +- scripts/setup_integration_tests.sh | 35 + server/llm_test.go | 103 +++ server/llm_utils_test.go | 76 ++ server/routes.go | 4 - 27 files changed, 1189 insertions(+), 765 deletions(-) create mode 100644 llm/ext_server.go create mode 100644 llm/gpu_cuda.go create mode 100644 llm/gpu_darwin.go create mode 100644 llm/llama.cpp/gen_common.sh create mode 100755 llm/llama.cpp/gen_darwin.sh create mode 100755 llm/llama.cpp/gen_linux.sh create mode 100644 llm/llama.cpp/gen_windows.ps1 create mode 100644 llm/llama.cpp/generate_darwin.go delete mode 100644 llm/llama.cpp/generate_darwin_amd64.go delete mode 100644 llm/llama.cpp/generate_darwin_arm64.go create mode 100644 llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch delete mode 100644 llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch delete mode 100644 llm/llama.cpp/patches/0001-update-default-log-target.patch create mode 100755 scripts/setup_integration_tests.sh create mode 100644 server/llm_test.go create mode 100644 server/llm_utils_test.go diff --git a/.dockerignore b/.dockerignore index 116c58f7..f27e7fd6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,7 +2,7 @@ ollama app dist -scripts llm/llama.cpp/gguf .env .cache +test_data \ No newline at end of file diff --git a/.gitignore b/.gitignore index 30dd3173..97f73481 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ ollama ggml-metal.metal .cache *.exe -.idea \ No newline at end of file +.idea +test_data \ No newline at end of file diff --git a/go.mod b/go.mod index 1bba54f9..0df1372b 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/gin-gonic/gin v1.9.1 github.com/olekukonko/tablewriter v0.0.5 github.com/spf13/cobra v1.7.0 - github.com/stretchr/testify v1.8.3 + github.com/stretchr/testify v1.8.4 golang.org/x/sync v0.3.0 ) diff --git a/go.sum b/go.sum index 59e1590a..ff6bcbd9 100644 --- a/go.sum +++ b/go.sum @@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= github.com/twitchyliquid64/golang-asm v0.15.1/go.mod 
h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M= diff --git a/llm/ext_server.go b/llm/ext_server.go new file mode 100644 index 00000000..6e31dca7 --- /dev/null +++ b/llm/ext_server.go @@ -0,0 +1,325 @@ +package llm + +/* +#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common +#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 +#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds +#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable +#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE +#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE +#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG +#cgo darwin LDFLAGS: -lc++ -framework Accelerate +#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders +#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a +#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a +#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a +#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a +#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a +#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a +#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a +#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a +#cgo linux CFLAGS: -D_GNU_SOURCE +#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS +#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a +#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a +#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a +#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a +#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a +#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a +#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm +#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin +#cgo windows LDFLAGS: -lext_server_shared -lpthread + +#include +#include "examples/server/server.h" + +*/ +import "C" +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "log" + "os" + "runtime" + "sync" + "time" + "unsafe" + + "github.com/jmorganca/ollama/api" +) + +func errWrap(resp C.ext_server_err) error { + if resp.code == 0 { + return nil + } + err := fmt.Errorf(C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + return err +} + +type llamaExtServer struct { + api.Options +} + +// Note: current implementation does not support concurrent instantiations +var mutex sync.Mutex + +func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) { + if !mutex.TryLock() { + log.Printf("concurrent llm 
servers not yet supported, waiting for prior server to complete") + mutex.Lock() + } + server := &llamaExtServer{opts} + fileInfo, err := os.Stat(model) + if err != nil { + return nil, err + } + var sparams C.ext_server_params + sparams.model = C.CString(model) + defer C.free(unsafe.Pointer(sparams.model)) + + numGPU := NumGPU(numLayers, fileInfo.Size(), opts) + + sparams.embedding = true + sparams.n_ctx = C.uint(opts.NumCtx) + sparams.n_batch = C.uint(opts.NumBatch) + sparams.n_gpu_layers = C.int(numGPU) + sparams.main_gpu = C.int(opts.MainGPU) + sparams.n_parallel = 2 // TODO - wire up concurrency + + // Always use the value encoded in the model + sparams.rope_freq_base = 0.0 + sparams.rope_freq_scale = 0.0 + + sparams.lora_adapters = nil + for i := 0; i < len(adapters); i++ { + la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter)) + defer C.free(unsafe.Pointer(la)) + la.adapter = C.CString(adapters[i]) + defer C.free(unsafe.Pointer(la.adapter)) + la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX + la.next = nil + if i == 0 { + sparams.lora_adapters = la + } else { + tmp := sparams.lora_adapters + for ; tmp.next != nil; tmp = tmp.next { + } + tmp.next = la + } + } + + // TODO - implement ME + // if len(projectors) > 0 { + // // TODO: applying multiple projectors is not supported by the llama.cpp server yet + // params = append(params, "--mmproj", projectors[0]) + // } + + if opts.NumThread > 0 { + sparams.n_threads = C.uint(opts.NumThread) + } else { + sparams.n_threads = C.uint(runtime.NumCPU()) + } + + sparams.memory_f16 = false + if opts.F16KV { + sparams.memory_f16 = true + } + sparams.use_mlock = false + if opts.UseMLock { + sparams.use_mlock = true + } + sparams.use_mmap = true + if !opts.UseMMap { + sparams.use_mmap = false + } + sparams.numa = false + if opts.UseNUMA { + sparams.numa = true + } + + log.Printf("Initializing internal llama server") + err = errWrap(C.llama_server_init(&sparams)) + if err != nil { + return nil, err + } + + log.Printf("Starting internal llama main loop") + C.llama_server_start() + return server, nil +} + +func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { + + request := map[string]any{ + "prompt": predict.Prompt, + "stream": true, + "n_predict": llm.NumPredict, + "n_keep": llm.NumKeep, + "temperature": llm.Temperature, + "top_k": llm.TopK, + "top_p": llm.TopP, + "tfs_z": llm.TFSZ, + "typical_p": llm.TypicalP, + "repeat_last_n": llm.RepeatLastN, + "repeat_penalty": llm.RepeatPenalty, + "presence_penalty": llm.PresencePenalty, + "frequency_penalty": llm.FrequencyPenalty, + "mirostat": llm.Mirostat, + "mirostat_tau": llm.MirostatTau, + "mirostat_eta": llm.MirostatEta, + "penalize_nl": llm.PenalizeNewline, + "seed": llm.Seed, + "stop": llm.Stop, + } + + if predict.Format == "json" { + request["grammar"] = jsonGrammar + } + + // Handling JSON marshaling with special characters unescaped. 
+ buffer := &bytes.Buffer{} + enc := json.NewEncoder(buffer) + enc.SetEscapeHTML(false) + + if err := enc.Encode(request); err != nil { + return fmt.Errorf("failed to marshal data: %w", err) + } + + req := C.CString(buffer.String()) + defer C.free(unsafe.Pointer(req)) + + cmpCtx := C.llama_server_completion(req) + if cmpCtx.task_id < 0 { + defer C.free(unsafe.Pointer(cmpCtx.err)) + return fmt.Errorf(C.GoString(cmpCtx.err)) + } + + for { + select { + case <-ctx.Done(): + // This handles the request cancellation + return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) + default: + result := C.llama_server_completion_next_result(cmpCtx.task_id) + if result.result_json != nil { + defer C.free(unsafe.Pointer(result.result_json)) + } + var p prediction + if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil { + err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) + return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2) + } + + if p.Content != "" { + fn(PredictResult{ + // Model: predict.Model, // XXX remove or replace? + CreatedAt: time.Now().UTC(), + Content: p.Content, + }) + } + + if p.Stop { + fn(PredictResult{ + // Model: predict.Model, // XXX remove or replace? + CreatedAt: time.Now().UTC(), + TotalDuration: time.Since(predict.CheckpointStart), + Done: true, + PromptEvalCount: p.Timings.PromptN, + PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), + EvalCount: p.Timings.PredictedN, + EvalDuration: parseDurationMs(p.Timings.PredictedMS), + }) + return nil + } + } + } +} + +func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { + data, err := json.Marshal(TokenizeRequest{Content: prompt}) + if err != nil { + return nil, fmt.Errorf("marshaling encode data: %w", err) + } + req := C.CString(string(data)) + defer C.free(unsafe.Pointer(req)) + var resp C.ext_server_resp + err = errWrap(C.llama_server_tokenize(req, &resp)) + if resp.json_resp != nil { + defer C.free(unsafe.Pointer(resp.json_resp)) + } + + var encoded TokenizeResponse + if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil { + return nil, fmt.Errorf("unmarshal encode response: %w", err2) + } + + return encoded.Tokens, err +} + +func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { + if len(tokens) == 0 { + return "", nil + } + data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) + if err != nil { + return "", fmt.Errorf("marshaling decode data: %w", err) + } + + req := C.CString(string(data)) + defer C.free(unsafe.Pointer(req)) + var resp C.ext_server_resp + err = errWrap(C.llama_server_detokenize(req, &resp)) + if resp.json_resp != nil { + defer C.free(unsafe.Pointer(resp.json_resp)) + } + + var decoded DetokenizeResponse + if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil { + return "", fmt.Errorf("unmarshal encode response: %w", err2) + } + + return decoded.Content, err +} + +func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { + data, err := json.Marshal(TokenizeRequest{Content: input}) + if err != nil { + return nil, fmt.Errorf("error marshaling embed data: %w", err) + } + + req := C.CString(string(data)) + defer C.free(unsafe.Pointer(req)) + var resp C.ext_server_resp + err = errWrap(C.llama_server_embedding(req, &resp)) + if resp.json_resp != nil { + defer C.free(unsafe.Pointer(resp.json_resp)) + } + if err != nil { + return nil, err + } + + var embedding 
EmbeddingResponse + if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil { + return nil, fmt.Errorf("unmarshal tokenize response: %w", err) + } + + return embedding.Embedding, nil +} + +func (llm *llamaExtServer) Ping(ctx context.Context) error { + // TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state + return nil +} + +func (llm *llamaExtServer) Close() { + C.llama_server_stop() + mutex.Unlock() +} diff --git a/llm/gpu_cuda.go b/llm/gpu_cuda.go new file mode 100644 index 00000000..0afa8e2b --- /dev/null +++ b/llm/gpu_cuda.go @@ -0,0 +1,57 @@ +//go:build linux || windows + +package llm + +import ( + "errors" + "log" + + "github.com/jmorganca/ollama/api" +) + +/* +#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/" +#cgo linux LDFLAGS: -lnvidia-ml + +#include +#include "examples/server/server.h" +*/ +import "C" + +// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs +func CheckVRAM() (int64, error) { + return int64(C.check_vram()), nil +} + +func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { + if opts.NumGPU != -1 { + return opts.NumGPU + } + freeBytes, err := CheckVRAM() + if err != nil { + if !errors.Is(err, errNvidiaSMI) { + log.Print(err.Error()) + } + // nvidia driver not installed or no nvidia GPU found + return 0 + } + + /* + Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. + We can store the model weights and the kv cache in vram, + to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. + */ + bytesPerLayer := fileSizeBytes / numLayer + + // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors + layers := int(freeBytes/bytesPerLayer) * 3 / 4 + + // TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU + // if int64(layers) < numLayer { + // log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024)) + // return 0 + // } + log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer) + + return layers +} diff --git a/llm/gpu_darwin.go b/llm/gpu_darwin.go new file mode 100644 index 00000000..39ee4f75 --- /dev/null +++ b/llm/gpu_darwin.go @@ -0,0 +1,19 @@ +//go:build darwin + +package llm + +import ( + "github.com/jmorganca/ollama/api" +) + +// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs +func CheckVRAM() (int64, error) { + // TODO - assume metal, and return free memory? 
+ return 0, errNvidiaSMI + +} + +func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { + // default to enable metal on macOS + return 1 +} diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh new file mode 100644 index 00000000..f17d19de --- /dev/null +++ b/llm/llama.cpp/gen_common.sh @@ -0,0 +1,34 @@ +# common logic accross linux and darwin + +init_vars() { + PATCHES="0001-Expose-callable-API-for-server.patch" + CMAKE_DEFS="-DLLAMA_ACCELERATE=on" + # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings + CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server" + if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then + CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}" + else + # TODO - add additional optimization flags... + CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release ${CMAKE_DEFS}" + fi +} + +git_module_setup() { + # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo + git submodule init + git submodule update --force gguf + +} + +apply_patches() { + # Workaround git apply not handling creation well for iteration + rm -f gguf/examples/server/server.h + for patch in ${PATCHES} ; do + git -C gguf apply ../patches/${patch} + done +} + +build() { + cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS} + cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 +} \ No newline at end of file diff --git a/llm/llama.cpp/gen_darwin.sh b/llm/llama.cpp/gen_darwin.sh new file mode 100755 index 00000000..448c595b --- /dev/null +++ b/llm/llama.cpp/gen_darwin.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# This script is intended to run inside the go generate +# working directory must be ../llm/llama.cpp + +# TODO - add hardening to detect missing tools (cmake, etc.) 
+ +set -ex +set -o pipefail +echo "Starting darwin generate script" +source $(dirname $0)/gen_common.sh +init_vars +CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 ${CMAKE_DEFS}" +case "${GOARCH}" in + "amd64") + CMAKE_DEFS="-DLLAMA_METAL=off -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}" + BUILD_DIR="gguf/build/cpu" + ;; + "arm64") + CMAKE_DEFS="-DLLAMA_METAL=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}" + BUILD_DIR="gguf/build/metal" + ;; + *) + echo "GOARCH must be set" + echo "this script is meant to be run from within go generate" + exit 1 + ;; +esac + +git_module_setup +apply_patches +build + +# Enable local debug/run usecase +if [ -e "gguf/ggml-metal.metal" ]; then + cp gguf/ggml-metal.metal ../../ +fi diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh new file mode 100755 index 00000000..c5405dd8 --- /dev/null +++ b/llm/llama.cpp/gen_linux.sh @@ -0,0 +1,17 @@ +#!/bin/sh +# This script is intended to run inside the go generate +# working directory must be ../llm/llama.cpp + +set -ex +set -o pipefail + +# TODO - stopped here - map the variables from above over and refine the case statement below + +echo "Starting linux generate script" +source $(dirname $0)/gen_common.sh +init_vars +CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +BUILD_DIR="gguf/build/cuda" +git_module_setup +apply_patches +build diff --git a/llm/llama.cpp/gen_windows.ps1 b/llm/llama.cpp/gen_windows.ps1 new file mode 100644 index 00000000..9717b2e7 --- /dev/null +++ b/llm/llama.cpp/gen_windows.ps1 @@ -0,0 +1,51 @@ +#!powershell + +$ErrorActionPreference = "Stop" + +function init_vars { + $script:buildDir="gguf/build/wincuda" + $script:installDir="gguf/build/wincuda/dist" + $script:patches = @("0001-Expose-callable-API-for-server.patch") + $script:cmakeDefs = @("-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-DLLAMA_CUBLAS=ON","-DCMAKE_VERBOSE_MAKEFILE=ON","-DBUILD_SHARED_LIBS=on","-A","x64") + + if ($env:CGO_CFLAGS -contains "-g") { + $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on") + $script:config += "RelWithDebInfo" + } else { + $script:config += "Release" + } +} + +function git_module_setup { + # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo + & git submodule init + & git submodule update --force gguf +} + +function apply_patches { + rm -erroraction ignore -path "gguf/examples/server/server.h" + foreach ($patch in $patches) { + write-host "Applying patch $patch" + & git -C gguf apply ../patches/$patch + } +} + +function build { + write-host "generating config with: cmake -S gguf -B $buildDir $cmakeDefs" + & cmake --version + & cmake -S gguf -B $buildDir $cmakeDefs + write-host "building with: cmake --build $buildDir --config $config" + & cmake --build $buildDir --config $config +} + +function install { + rm -erroraction ignore -recurse -force -path $installDir + & cmake --install $buildDir --prefix $installDir --config $config + +} + +init_vars +git_module_setup +apply_patches +build +install \ No newline at end of file diff --git a/llm/llama.cpp/generate_darwin.go b/llm/llama.cpp/generate_darwin.go new file mode 100644 index 00000000..498e5005 --- /dev/null +++ b/llm/llama.cpp/generate_darwin.go @@ -0,0 +1,3 @@ +package llm + +//go:generate sh ./gen_darwin.sh 
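[Annotation, not part of the patch] As a usage note (assuming git, cmake, and a C/C++ toolchain are on PATH), the //go:generate directive added above is driven by the standard Go tooling from the repository root:

    go generate ./...

which runs gen_darwin.sh (or the platform equivalent) to check out the gguf submodule, apply the patches, and build the static libraries that the cgo bindings link against.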
diff --git a/llm/llama.cpp/generate_darwin_amd64.go b/llm/llama.cpp/generate_darwin_amd64.go deleted file mode 100644 index fed45fd9..00000000 --- a/llm/llama.cpp/generate_darwin_amd64.go +++ /dev/null @@ -1,9 +0,0 @@ -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch -//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on -//go:generate cmake --build gguf/build/cpu --target server --config Release -//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner diff --git a/llm/llama.cpp/generate_darwin_arm64.go b/llm/llama.cpp/generate_darwin_arm64.go deleted file mode 100644 index 0c33bc51..00000000 --- a/llm/llama.cpp/generate_darwin_arm64.go +++ /dev/null @@ -1,9 +0,0 @@ -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch -//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -//go:generate cmake --build gguf/build/metal --target server --config Release -//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index e67ca21a..6782a614 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -1,14 +1,3 @@ package llm -//go:generate git submodule init - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch -//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -//go:generate cmake --build gguf/build/cpu --target server --config Release -//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner - -//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0 -//go:generate cmake --build gguf/build/cuda --target server --config Release -//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner +//go:generate sh ./gen_linux.sh diff --git a/llm/llama.cpp/generate_windows.go b/llm/llama.cpp/generate_windows.go index 6cd9566f..87acd827 100644 --- a/llm/llama.cpp/generate_windows.go +++ b/llm/llama.cpp/generate_windows.go @@ -1,17 +1,3 @@ package llm -//go:generate git submodule init - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch -//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -//go:generate cmake --build gguf/build/cpu --target server --config Release -//go:generate cmd /c move 
gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe - -//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cuda --target server --config Release -//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe - -//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -//go:generate cmake --build gguf/build/cuda --target server --config Release -//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe +//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1 diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch new file mode 100644 index 00000000..838347d5 --- /dev/null +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -0,0 +1,422 @@ +From 64b3fbb150d12b3ca63ac2fb4e57bc46f41d2ccd Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Mon, 13 Nov 2023 12:25:58 -0800 +Subject: [PATCH] Expose callable API for server + +This adds an extern "C" interface within the example server +--- + examples/server/CMakeLists.txt | 24 ++++ + examples/server/server.cpp | 247 +++++++++++++++++++++++++++++++++ + examples/server/server.h | 83 +++++++++++ + ggml-cuda.cu | 1 + + 4 files changed, 355 insertions(+) + create mode 100644 examples/server/server.h + +diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt +index 859cd12..4ea47a7 100644 +--- a/examples/server/CMakeLists.txt ++++ b/examples/server/CMakeLists.txt +@@ -11,3 +11,27 @@ if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) + endif() + target_compile_features(${TARGET} PRIVATE cxx_std_11) ++ ++set(TARGET ext_server) ++option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) ++add_library(${TARGET} STATIC server.cpp) ++target_include_directories(${TARGET} PRIVATE ../../common) ++target_include_directories(${TARGET} PRIVATE ../..) 
++target_compile_features(${TARGET} PRIVATE cxx_std_11) ++target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) ++target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT}) ++ ++if (BUILD_SHARED_LIBS) ++ set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON) ++ target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD) ++ add_library(ext_server_shared SHARED $) ++ target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT}) ++ install(TARGETS ext_server_shared LIBRARY) ++endif() ++ ++if (CUDAToolkit_FOUND) ++ target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) ++ if (WIN32) ++ target_link_libraries(ext_server_shared PRIVATE nvml) ++ endif() ++endif() +\ No newline at end of file +diff --git a/examples/server/server.cpp b/examples/server/server.cpp +index 895f751..f939590 100644 +--- a/examples/server/server.cpp ++++ b/examples/server/server.cpp +@@ -5,6 +5,9 @@ + #include "../llava/clip.h" + + #include "stb_image.h" ++#if defined(LLAMA_SERVER_LIBRARY) ++#include "server.h" ++#endif + + #ifndef NDEBUG + // crash the server in debug mode, otherwise send an http 500 error +@@ -2631,6 +2634,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con + } + } + ++#ifndef LLAMA_SERVER_LIBRARY + int main(int argc, char **argv) + { + // own arguments required by this example +@@ -3065,3 +3069,246 @@ int main(int argc, char **argv) + llama_backend_free(); + return 0; + } ++ ++#else // LLAMA_SERVER_LIBRARY ++// Expose the llama server as a callable extern "C" API ++llama_server_context llama; ++std::atomic ext_server_running(false); ++std::thread ext_server_thread; ++inline ext_server_err makeErr(uint32_t code, std::string msg) { ++ if (code == 0) { ++ return ext_server_err{0, NULL}; ++ } ++ const std::string::size_type size = msg.size(); ++ ext_server_err ret = { ++ code, ++ new char[size + 1], ++ }; ++ memcpy(ret.err, msg.c_str(), size + 1); ++ return ret; ++} ++ ++ext_server_err llama_server_init(ext_server_params *sparams) ++{ ++ log_set_target(stdout); ++ gpt_params params; ++ params.n_ctx = sparams->n_ctx; ++ params.n_batch = sparams->n_batch; ++ params.n_threads = sparams->n_threads; ++ params.n_parallel = sparams->n_parallel; ++ params.rope_freq_base = sparams->rope_freq_base; ++ params.rope_freq_scale = sparams->rope_freq_scale; ++ ++ if (sparams->memory_f16) { ++ params.cache_type_k = "f16"; ++ params.cache_type_v = "f16"; ++ } else { ++ params.cache_type_k = "f32"; ++ params.cache_type_v = "f32"; ++ } ++ ++ params.n_gpu_layers = sparams->n_gpu_layers; ++ params.main_gpu = sparams->main_gpu; ++ params.use_mlock = sparams->use_mlock; ++ params.use_mmap = sparams->use_mmap; ++ params.numa = sparams->numa; ++ params.embedding = sparams->embedding; ++ if (sparams->model != NULL) { ++ params.model = sparams->model; ++ } ++ ++ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) { ++ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); ++ } ++ ++ try { ++ llama_backend_init(params.numa); ++ ++ // load the model ++ if (!llama.load_model(params)) ++ { ++ // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages ++ // and pass them back to the caller for better UX ++ return makeErr(1, "error loading model " + params.model); ++ } ++ ++ llama.initialize(); ++ } catch (std::exception &e) { ++ return makeErr(1, e.what()); ++ 
} catch (...) { ++ return makeErr(1, "Unknown Exception initializing llama server"); ++ } ++ return makeErr(0, ""); ++} ++ ++void llama_server_start() ++{ ++ // TODO mutex to protect thread creation ++ ext_server_thread = std::thread([&]() ++ { ++ ext_server_running = true; ++ try { ++ LOG_TEE("llama server main loop starting\n"); ++ ggml_time_init(); ++ while (ext_server_running.load()) ++ { ++ if (!llama.update_slots()) { ++ LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n"); ++ break; ++ } ++ } ++ } catch (std::exception &e) { ++ LOG_TEE("caught exception in llama server main loop: %s\n", e.what()); ++ } catch (...) { ++ LOG_TEE("caught unknown exception in llama server main loop\n"); ++ } ++ LOG_TEE("\nllama server shutting down\n"); ++ llama_backend_free(); ++ }); ++} ++ ++void llama_server_stop() { ++ // TODO - too verbose, remove once things are solid ++ LOG_TEE("requesting llama server shutdown\n"); ++ ext_server_running = false; ++ ext_server_thread.join(); ++ LOG_TEE("llama server shutdown complete\n"); ++} ++ ++ext_server_completion_resp llama_server_completion(const char *json_req) { ++ std::string msg; ++ ext_server_completion_resp resp = { ++ 0, ++ NULL, ++ }; ++ try { ++ json data = json::parse(json_req); ++ resp.task_id = llama.request_completion(data, false, false, -1); ++ return resp; ++ } catch (std::exception &e) { ++ msg = e.what(); ++ } catch (...) { ++ msg = "Unknown Exception during completion"; ++ } ++ const std::string::size_type size = msg.size(); ++ resp.task_id = 0; ++ resp.err = new char[size + 1]; ++ memcpy(resp.err, msg.c_str(), size + 1); ++ return resp; ++} ++ ++ext_task_result llama_server_completion_next_result(const int task_id) { ++ std::string msg; ++ ext_task_result resp = {-1,false,false,NULL}; ++ try { ++ task_result result = llama.next_result(task_id); ++ std::string result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); ++ const std::string::size_type size = result_json.size(); ++ resp.id = result.id; ++ resp.stop = result.stop; ++ resp.error = result.error; ++ resp.result_json = new char[size + 1]; ++ memcpy(resp.result_json, result_json.c_str(), size + 1); ++ if (result.error) { ++ llama.request_cancel(task_id); ++ } else if (result.stop) { ++ llama.request_cancel(task_id); ++ } ++ return resp; ++ } catch (std::exception &e) { ++ msg = e.what(); // TODO - json? ++ } catch (...) { ++ msg = "Unknown Exception during completion"; ++ } ++ resp.error = true; ++ const std::string::size_type size = msg.size(); ++ resp.result_json = new char[size + 1]; ++ memcpy(resp.result_json, msg.c_str(), size + 1); ++ return resp; ++} ++ ++ext_server_err llama_server_completion_cancel(const int task_id) { ++ try { ++ llama.request_cancel(task_id); ++ } catch (std::exception &e) { ++ return makeErr(1, e.what()); ++ } catch (...) 
{ ++ return makeErr(1, "Unknown Exception running llama server"); ++ } ++ return makeErr(0, ""); ++} ++ ++ ++ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp) { ++ resp->json_resp = NULL; ++ try { ++ const json body = json::parse(json_req); ++ std::vector tokens; ++ if (body.count("content") != 0) ++ { ++ tokens = llama.tokenize(body["content"], false); ++ } ++ const json data = format_tokenizer_response(tokens); ++ std::string result_json = data.dump(); ++ const std::string::size_type size = result_json.size(); ++ resp->json_resp = new char[size + 1]; ++ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ } catch (std::exception &e) { ++ return makeErr(1, e.what()); ++ } catch (...) { ++ return makeErr(1, "Unknown Exception during tokenize"); ++ } ++ return makeErr(0, ""); ++} ++ ++ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp) { ++ resp->json_resp = NULL; ++ try { ++ const json body = json::parse(json_req); ++ std::string content; ++ if (body.count("tokens") != 0) ++ { ++ const std::vector tokens = body["tokens"]; ++ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); ++ } ++ const json data = format_detokenized_response(content); ++ std::string result_json = data.dump(); ++ const std::string::size_type size = result_json.size(); ++ resp->json_resp = new char[size + 1]; ++ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ } catch (std::exception &e) { ++ return makeErr(1, e.what()); ++ } catch (...) { ++ return makeErr(1, "Unknown Exception during detokenize"); ++ } ++ return makeErr(0, ""); ++} ++ ++ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp) { ++ resp->json_resp = NULL; ++ try { ++ const json body = json::parse(json_req); ++ json prompt; ++ if (body.count("content") != 0) ++ { ++ prompt = body["content"]; ++ } ++ else ++ { ++ prompt = ""; ++ } ++ const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); ++ task_result result = llama.next_result(task_id); ++ std::string result_json = result.result_json.dump(); ++ const std::string::size_type size = result_json.size(); ++ resp->json_resp = new char[size + 1]; ++ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ } catch (std::exception &e) { ++ return makeErr(1, e.what()); ++ } catch (...) { ++ return makeErr(1, "Unknown Exception during detokenize"); ++ } ++ return makeErr(0, ""); ++} ++ ++#endif // LLAMA_SERVER_LIBRARY +\ No newline at end of file +diff --git a/examples/server/server.h b/examples/server/server.h +new file mode 100644 +index 0000000..4d03b1e +--- /dev/null ++++ b/examples/server/server.h +@@ -0,0 +1,83 @@ ++#if defined(LLAMA_SERVER_LIBRARY) ++#ifndef LLAMA_SERVER_H ++#define LLAMA_SERVER_H ++#include ++#include ++#include ++#include ++ ++// This exposes extern C entrypoints into the llama_server ++// To enable the server compile with LLAMA_SERVER_LIBRARY ++ ++#ifdef __cplusplus ++extern "C" ++{ ++#endif ++ // TODO - clean the type def's up a bit for better consistency ++ typedef struct ext_server_err { ++ uint32_t code; // 0 on success, > 0 on error ++ char *err; // null if code == 0; else contains error message. 
Caller responsible for freeing memory ++ } ext_server_err; ++ ++ typedef struct ext_server_lora_adapter { ++ char *adapter; ++ float scale; ++ struct ext_server_lora_adapter *next; ++ } ext_server_lora_adapter; ++ typedef struct ext_server_params ++ { ++ char *model; ++ uint32_t n_ctx; // text context, 0 = from model ++ uint32_t n_batch; // prompt processing maximum batch size ++ uint32_t n_threads; // number of threads to use for generation ++ int32_t n_parallel; // number of parallel sequences to decodewra ++ float rope_freq_base; // RoPE base frequency, 0 = from model ++ float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model ++ bool memory_f16; // use f16 instead of f32 for memory kv ++ int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default) ++ int32_t main_gpu; // the GPU that is used for scratch and small tensors ++ bool use_mlock; // force system to keep model in RAM ++ bool use_mmap; // use mmap if possible ++ bool numa; // attempt optimizations that help on some NUMA systems ++ bool embedding; // get only sentence embedding ++ ext_server_lora_adapter* lora_adapters; ++ } ext_server_params; ++ ++ // Initialize the server once per process ++ ext_server_err llama_server_init(ext_server_params *sparams); ++ ++ // Run the main loop ++ void llama_server_start(); ++ // Stop the main loop ++ void llama_server_stop(); ++ ++ typedef struct ext_task_result ++ { ++ int id; ++ bool stop; ++ bool error; ++ char* result_json; // caller responsible to free this memory ++ } ext_task_result; ++ ++ typedef struct ext_server_completion_resp { ++ int task_id; // < 0 on error, >= 0 on success ++ char *err; // null if task_id >= 0; else contains error message. Caller responsible for freeing memory ++ } ext_server_completion_resp; ++ ext_server_completion_resp llama_server_completion(const char *json_req); ++ ext_task_result llama_server_completion_next_result(const int task_id); ++ ext_server_err llama_server_completion_cancel(const int task_id); ++ ++ // Caller responsible for freeing json_resp ++ typedef struct ext_server_resp { ++ char *json_resp; // Caller responsible for freeing string ++ } ext_server_resp; ++ ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp); ++ ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp); ++ ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif ++#endif // LLAMA_SERVER_LIBRARY +\ No newline at end of file +diff --git a/ggml-cuda.cu b/ggml-cuda.cu +index 85f7a29..ce51364 100644 +--- a/ggml-cuda.cu ++++ b/ggml-cuda.cu +@@ -6410,6 +6410,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( + CUDA_CHECK(cudaGetDevice(&id)); + src_ptr = (char *) extra->data_device[id]; + } else { ++ fprintf(stderr, "ggml_cuda_cpy_tensor_2d assert: backend: %d\n", src->backend); + GGML_ASSERT(false); + } + char * dst_ptr = (char *) dst; +-- +2.39.3 (Apple Git-145) + diff --git a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch deleted file mode 100644 index 1fd07973..00000000 --- a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Wed, 20 Sep 2023 14:19:52 -0700 -Subject: [PATCH] copy cuda runtime libraries - ---- - CMakeLists.txt | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/CMakeLists.txt 
b/CMakeLists.txt -index 824d9f2..dd24137 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - -+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY) -+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY) -+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY) -+ - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # 52 == lowest CUDA 12 standard - # 60 == f16 CUDA intrinsics --- -2.42.0 - diff --git a/llm/llama.cpp/patches/0001-update-default-log-target.patch b/llm/llama.cpp/patches/0001-update-default-log-target.patch deleted file mode 100644 index 568ca716..00000000 --- a/llm/llama.cpp/patches/0001-update-default-log-target.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Mon, 23 Oct 2023 10:39:34 -0700 -Subject: [PATCH] default log stderr - ---- - common/log.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/common/log.h b/common/log.h -index b8953fd..25522cd 100644 ---- a/common/log.h -+++ b/common/log.h -@@ -90,7 +90,7 @@ - // } - // - #ifndef LOG_TARGET -- #define LOG_TARGET log_handler() -+ #define LOG_TARGET nullptr - #endif - - #ifndef LOG_TEE_TARGET --- -2.42.0 - diff --git a/llm/llama.go b/llm/llama.go index 574c24ff..b3c57d47 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -1,25 +1,12 @@ package llm import ( - "bufio" "bytes" "context" - "embed" - "encoding/json" "errors" "fmt" - "io" - "io/fs" - "log" - "math/rand" - "net/http" "os" "os/exec" - "path" - "path/filepath" - "runtime" - "strconv" - "strings" "sync" "time" @@ -55,107 +42,6 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws ws ::= ([ \t\n] ws)? 
` -//go:embed llama.cpp/*/build/*/bin/* -var llamaCppEmbed embed.FS - -type ModelRunner struct { - Path string // path to the model runner executable - Accelerated bool -} - -func chooseRunners(workDir string) []ModelRunner { - buildPath := path.Join("llama.cpp", "gguf", "build") - var runners []ModelRunner - - // set the runners based on the OS - // IMPORTANT: the order of the runners in the array is the priority order - switch runtime.GOOS { - case "darwin": - if runtime.GOARCH == "arm64" { - runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}} - } else { - runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}} - } - case "linux": - runners = []ModelRunner{ - {Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true}, - {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, - } - case "windows": - // TODO: select windows GPU runner here when available - runners = []ModelRunner{ - {Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true}, - {Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")}, - } - default: - log.Printf("unknown OS, running on CPU: %s", runtime.GOOS) - runners = []ModelRunner{ - {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}, - } - } - - runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail - for _, r := range runners { - // find all the files in the runner's bin directory - files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*")) - if err != nil { - // this is expected, ollama may be compiled without all runners packed in - log.Printf("%s runner not found: %v", r.Path, err) - continue - } - - for _, f := range files { - runnerAvailable = true - - srcFile, err := llamaCppEmbed.Open(f) - if err != nil { - log.Fatalf("read llama runner %s: %v", f, err) - } - defer srcFile.Close() - - // create the directory in case it does not exist, filepath.Dir() converts the file path to the OS's format - destPath := filepath.Join(workDir, filepath.Dir(f)) - if err := os.MkdirAll(destPath, 0o755); err != nil { - log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err) - } - - // create the path to the destination file, filepath.Base() converts the file path to the OS's format - destFile := filepath.Join(destPath, filepath.Base(f)) - - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - log.Fatalf("write llama runner %s: %v", f, err) - } - defer destFile.Close() - - if _, err := io.Copy(destFile, srcFile); err != nil { - log.Fatalf("copy llama runner %s: %v", f, err) - } - case err != nil: - log.Fatalf("stat llama runner %s: %v", f, err) - } - } - } - if !runnerAvailable { - log.Fatalf("gguf runner not found") - } - - // return the runners to try in priority order - localRunnersByPriority := []ModelRunner{} - for _, r := range runners { - // clean the ModelRunner paths so that they match the OS we are running on - localRunnersByPriority = append(localRunnersByPriority, ModelRunner{ - Path: filepath.Clean(path.Join(workDir, r.Path)), - Accelerated: r.Accelerated, - }) - } - - return localRunnersByPriority -} - type llamaModel struct { hyperparameters llamaHyperparameters } @@ -237,72 +123,6 @@ var ( errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") ) -// CheckVRAM returns the free VRAM in bytes 
on Linux machines with NVIDIA GPUs -func CheckVRAM() (int64, error) { - cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits") - var stdout bytes.Buffer - cmd.Stdout = &stdout - err := cmd.Run() - if err != nil { - return 0, errNvidiaSMI - } - - var freeMiB int64 - scanner := bufio.NewScanner(&stdout) - for scanner.Scan() { - line := scanner.Text() - if strings.Contains(line, "[Insufficient Permissions]") { - return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi") - } - - vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64) - if err != nil { - return 0, fmt.Errorf("failed to parse available VRAM: %v", err) - } - - freeMiB += vram - } - - freeBytes := freeMiB * 1024 * 1024 - if freeBytes < 2*format.GigaByte { - log.Printf("less than 2 GB VRAM available") - return 0, errAvailableVRAM - } - - return freeBytes, nil -} - -func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { - if opts.NumGPU != -1 { - return opts.NumGPU - } - if runtime.GOOS == "linux" || runtime.GOOS == "windows" { - freeBytes, err := CheckVRAM() - if err != nil { - if !errors.Is(err, errNvidiaSMI) { - log.Print(err.Error()) - } - // nvidia driver not installed or no nvidia GPU found - return 0 - } - - /* - Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. - We can store the model weights and the kv cache in vram, - to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. - */ - bytesPerLayer := fileSizeBytes / numLayer - - // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors - layers := int(freeBytes/bytesPerLayer) * 3 / 4 - log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers) - - return layers - } - // default to enable metal on macOS - return 1 -} - // StatusWriter is a writer that captures error messages from the llama runner process type StatusWriter struct { ErrCh chan error @@ -331,204 +151,6 @@ func (w *StatusWriter) Write(b []byte) (int, error) { return os.Stderr.Write(b) } -func newLlama(model string, adapters, projectors []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) { - fileInfo, err := os.Stat(model) - if err != nil { - return nil, err - } - - if len(adapters) > 1 { - return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") - } - - numGPU := NumGPU(numLayers, fileInfo.Size(), opts) - params := []string{ - "--model", model, - "--ctx-size", fmt.Sprintf("%d", opts.NumCtx), - "--batch-size", fmt.Sprintf("%d", opts.NumBatch), - "--n-gpu-layers", fmt.Sprintf("%d", numGPU), - "--embedding", - "--parallel", "2", - } - - if opts.MainGPU > 0 { - params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU)) - } - - if opts.RopeFrequencyBase > 0 { - params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase)) - } - - if opts.RopeFrequencyScale > 0 { - params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale)) - } - - if opts.NumGQA > 0 { - params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA)) - } - - if len(adapters) > 0 { - // TODO: applying multiple adapters is not supported by the llama.cpp server yet - params = append(params, "--lora", adapters[0]) - } - - if len(projectors) > 0 { - // TODO: 
applying multiple projectors is not supported by the llama.cpp server yet - params = append(params, "--mmproj", projectors[0]) - } - - if opts.NumThread > 0 { - params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread)) - } - - if !opts.F16KV { - params = append(params, "--memory-f32") - } - if opts.UseMLock { - params = append(params, "--mlock") - } - if !opts.UseMMap { - params = append(params, "--no-mmap") - } - if opts.UseNUMA { - params = append(params, "--numa") - } - - var runnerErr error - - // start the llama.cpp server with a retry in case the port is already in use - for _, runner := range runners { - if runner.Accelerated && numGPU == 0 { - log.Printf("skipping accelerated runner because num_gpu=0") - continue - } - - if _, err := os.Stat(runner.Path); err != nil { - log.Printf("llama runner not found: %v", err) - continue - } - - port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range - params := append(params, "--port", strconv.Itoa(port)) - - ctx, cancel := context.WithCancel(context.Background()) - cmd := exec.CommandContext( - ctx, - runner.Path, - params..., - ) - - var libraryPaths []string - if libraryPath, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok { - libraryPaths = append(libraryPaths, libraryPath) - } - - libraryPaths = append(libraryPaths, filepath.Dir(runner.Path)) - - cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", strings.Join(libraryPaths, ":"))) - cmd.Stdout = os.Stderr - statusWriter := NewStatusWriter() - cmd.Stderr = statusWriter - - llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel, exitCh: make(chan error)}} - - log.Print("starting llama runner") - if err := llm.Cmd.Start(); err != nil { - log.Printf("error starting the external llama runner: %v", err) - continue - } - - // monitor the llama runner process and signal when it exits - go func() { - err := llm.Cmd.Wait() - // default to printing the exit message of the command process, it will probably just say 'exit staus 1' - errMsg := err.Error() - // try to set a better error message if llama runner logs captured an error - if statusWriter.LastErrMsg != "" { - errMsg = statusWriter.LastErrMsg - } - log.Println(errMsg) - // llm.Cmd.Wait() can only be called once, use this exit channel to signal that the process has exited - llm.exitOnce.Do(func() { - close(llm.exitCh) - }) - }() - - if err := waitForServer(llm); err != nil { - log.Printf("error starting llama runner: %v", err) - llm.Close() - - // default the runnerErr to the error returned by the most recent llama runner process - runnerErr = err - - // capture the error directly from the runner process, if any - select { - case runnerErr = <-statusWriter.ErrCh: - default: - // the runner process probably timed out - } - - // try again - continue - } - - // server started successfully - return llm, nil - } - - if runnerErr != nil { - // this is the error returned from the llama runner process that failed most recently - return nil, runnerErr - } - - return nil, fmt.Errorf("failed to start a llama runner") -} - -func waitForServer(llm *llama) error { - start := time.Now() - expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load - ticker := time.NewTicker(200 * time.Millisecond) - defer ticker.Stop() - - log.Print("waiting for llama runner to start responding") - for { - select { - case <-llm.exitCh: - // failed to start subprocess - return fmt.Errorf("llama runner process has terminated") - case <-ticker.C: - if 
time.Now().After(expiresAt) { - // timeout - return fmt.Errorf("timed out waiting for llama runner to start") - } - - if err := llm.Ping(context.Background()); err == nil { - // success - log.Printf("llama runner started in %f seconds", time.Since(start).Seconds()) - return nil - } - } - } -} - -func (llm *llama) Close() { - // signal the sub-process to terminate - llm.Cancel() - - // wait for the command to exit to prevent race conditions with the next run - <-llm.exitCh - - if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" { - log.Printf("llama runner stopped with error: %v", llm.StatusWriter.LastErrMsg) - } else { - log.Print("llama runner stopped successfully") - } -} - -func (llm *llama) SetOptions(opts api.Options) { - llm.Options = opts -} - type prediction struct { Content string `json:"content"` Model string `json:"model"` @@ -561,158 +183,6 @@ type PredictResult struct { EvalDuration time.Duration } -// IsRetryable checks if the line matches a condition that can be retried -func isRetryable(line []byte) bool { - return bytes.Contains(line, []byte("slot unavailable")) -} - -func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { - imageData := llm.ImageData - if len(predict.Images) > 0 { - for cnt, i := range predict.Images { - imageData = append(imageData, ImageData{Data: i, ID: cnt}) - } - } - log.Printf("loaded %d images", len(imageData)) - - request := map[string]any{ - "prompt": predict.Prompt, - "stream": true, - "n_predict": llm.NumPredict, - "n_keep": llm.NumKeep, - "main_gpu": llm.MainGPU, - "temperature": llm.Temperature, - "top_k": llm.TopK, - "top_p": llm.TopP, - "tfs_z": llm.TFSZ, - "typical_p": llm.TypicalP, - "repeat_last_n": llm.RepeatLastN, - "repeat_penalty": llm.RepeatPenalty, - "presence_penalty": llm.PresencePenalty, - "frequency_penalty": llm.FrequencyPenalty, - "mirostat": llm.Mirostat, - "mirostat_tau": llm.MirostatTau, - "mirostat_eta": llm.MirostatEta, - "penalize_nl": llm.PenalizeNewline, - "seed": llm.Seed, - "stop": llm.Stop, - "image_data": imageData, - } - - if predict.Format == "json" { - request["grammar"] = jsonGrammar - } - - retryDelay := 100 * time.Microsecond - for retries := 0; retries < maxRetries; retries++ { - if retries > 0 { - time.Sleep(retryDelay) // wait before retrying - retryDelay *= 2 // exponential backoff - } - - // Handling JSON marshaling with special characters unescaped. 
- buffer := &bytes.Buffer{} - enc := json.NewEncoder(buffer) - enc.SetEscapeHTML(false) - - if err := enc.Encode(request); err != nil { - return fmt.Errorf("failed to marshal data: %v", err) - } - - endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port) - req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer) - if err != nil { - return fmt.Errorf("error creating POST request: %v", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return fmt.Errorf("POST predict: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode >= 400 { - bodyBytes, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("failed reading llm error response: %w", err) - } - log.Printf("llm predict error: %s", bodyBytes) - return fmt.Errorf("%s", bodyBytes) - } - - scanner := bufio.NewScanner(resp.Body) - // increase the buffer size to avoid running out of space - buf := make([]byte, 0, maxBufferSize) - scanner.Buffer(buf, maxBufferSize) - - retryNeeded := false - for scanner.Scan() { - select { - case <-ctx.Done(): - // This handles the request cancellation - return ctx.Err() - default: - line := scanner.Bytes() - if len(line) == 0 { - continue - } - - if isRetryable(line) { - retryNeeded = true - break - } - - evt, ok := bytes.CutPrefix(line, []byte("data: ")) - if !ok { - return fmt.Errorf("error parsing llm response stream: %s", line) - } - - var p prediction - if err := json.Unmarshal(evt, &p); err != nil { - return fmt.Errorf("error unmarshaling llm prediction response: %v", err) - } - - if p.Content != "" { - fn(PredictResult{ - Content: p.Content, - }) - } - - if p.Stop { - fn(PredictResult{ - Done: true, - PromptEvalCount: p.Timings.PromptN, - PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), - EvalCount: p.Timings.PredictedN, - EvalDuration: parseDurationMs(p.Timings.PredictedMS), - }) - return nil - } - } - } - - if err := scanner.Err(); err != nil { - if strings.Contains(err.Error(), "unexpected EOF") { - // this means the llama runner subprocess crashed - llm.Close() - if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" { - return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg) - } - return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model") - } - return fmt.Errorf("error reading llm response: %v", err) - } - - if !retryNeeded { - return nil // success - } - } - - // should never reach here ideally - return fmt.Errorf("max retries exceeded") -} - type TokenizeRequest struct { Content string `json:"content"` } @@ -721,43 +191,6 @@ type TokenizeResponse struct { Tokens []int `json:"tokens"` } -func (llm *llama) Encode(ctx context.Context, prompt string) ([]int, error) { - endpoint := fmt.Sprintf("http://127.0.0.1:%d/tokenize", llm.Port) - data, err := json.Marshal(TokenizeRequest{Content: prompt}) - if err != nil { - return nil, fmt.Errorf("marshaling encode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data)) - if err != nil { - return nil, fmt.Errorf("encode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return nil, fmt.Errorf("do encode request: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("read encode request: %w", err) - } - - if resp.StatusCode >= 400 { - 
log.Printf("llm encode error: %s", body) - return nil, fmt.Errorf("%s", body) - } - - var encoded TokenizeResponse - if err := json.Unmarshal(body, &encoded); err != nil { - return nil, fmt.Errorf("unmarshal encode response: %w", err) - } - - return encoded.Tokens, nil -} - type DetokenizeRequest struct { Tokens []int `json:"tokens"` } @@ -766,46 +199,6 @@ type DetokenizeResponse struct { Content string `json:"content"` } -func (llm *llama) Decode(ctx context.Context, tokens []int) (string, error) { - if len(tokens) == 0 { - return "", nil - } - endpoint := fmt.Sprintf("http://127.0.0.1:%d/detokenize", llm.Port) - data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) - if err != nil { - return "", fmt.Errorf("marshaling decode data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data)) - if err != nil { - return "", fmt.Errorf("decode request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", fmt.Errorf("do decode request: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("read decode request: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm decode error: %s", body) - return "", fmt.Errorf("%s", body) - } - - var decoded DetokenizeResponse - if err := json.Unmarshal(body, &decoded); err != nil { - return "", fmt.Errorf("unmarshal encode response: %w", err) - } - - return decoded.Content, nil -} - type EmbeddingRequest struct { Content string `json:"content"` } @@ -813,52 +206,3 @@ type EmbeddingRequest struct { type EmbeddingResponse struct { Embedding []float64 `json:"embedding"` } - -func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error) { - endpoint := fmt.Sprintf("http://127.0.0.1:%d/embedding", llm.Port) - data, err := json.Marshal(TokenizeRequest{Content: input}) - if err != nil { - return nil, fmt.Errorf("error marshaling embed data: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data)) - if err != nil { - return nil, fmt.Errorf("error creating embed request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return nil, fmt.Errorf("POST embedding: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("error reading embed response: %w", err) - } - - if resp.StatusCode >= 400 { - log.Printf("llm encode error: %s", body) - return nil, fmt.Errorf("%s", body) - } - - var embedding EmbeddingResponse - if err := json.Unmarshal(body, &embedding); err != nil { - return nil, fmt.Errorf("unmarshal tokenize response: %w", err) - } - - return embedding.Embedding, nil -} - -// Ping checks that the server subprocess is still running and responding to requests -func (llm *llama) Ping(ctx context.Context) error { - resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port)) - if err != nil { - return fmt.Errorf("ping resp: %w", err) - } - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("unexpected ping status: %s", resp.Status) - } - return nil -} diff --git a/llm/llm.go b/llm/llm.go index 92bffd76..41724d35 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -18,7 +18,6 @@ type LLM interface { Embedding(context.Context, string) ([]float64, error) Encode(context.Context, string) ([]int, error) Decode(context.Context, 
[]int) (string, error) - SetOptions(api.Options) Close() Ping(context.Context) error } @@ -79,5 +78,5 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.NumGQA = 0 opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts) + return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) } diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index c35a3d8d..b37f6d0e 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -9,7 +9,7 @@ mkdir -p dist for TARGETARCH in arm64 amd64; do GOOS=darwin GOARCH=$TARGETARCH go generate ./... - GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH + CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH rm -rf llm/llama.cpp/*/build done diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 0a1099fc..20b44bf7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -7,7 +7,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version mkdir -p dist -for TARGETARCH in arm64 amd64; do +for TARGETARCH in amd64 arm64; do docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH diff --git a/scripts/setup_integration_tests.sh b/scripts/setup_integration_tests.sh new file mode 100755 index 00000000..a1d01ac1 --- /dev/null +++ b/scripts/setup_integration_tests.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# This script sets up integration tests which run the full stack to verify +# inference locally +set -e +set -o pipefail + +REPO=$(dirname $0)/../ +export OLLAMA_MODELS=${REPO}/test_data/models +REGISTRY_SCHEME=https +REGISTRY=registry.ollama.ai +TEST_MODEL=library/orca-mini +TEST_MODEL_TAG=latest +ACCEPT_HEADER="Accept: application/vnd.docker.distribution.manifest.v2+json" + +mkdir -p ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/ +mkdir -p ${OLLAMA_MODELS}/blobs/ + +echo "Pulling manifest for ${TEST_MODEL}:${TEST_MODEL_TAG}" +curl -s --header "${ACCEPT_HEADER}" \ + -o ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/manifests/${TEST_MODEL_TAG} + +CFG_HASH=$(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".config.digest") +echo "Pulling config blob ${CFG_HASH}" +curl -L -C - --header "${ACCEPT_HEADER}" \ + -o ${OLLAMA_MODELS}/blobs/${CFG_HASH} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${CFG_HASH} + +for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".layers[].digest" ) ; do + echo "Pulling blob ${LAYER}" + curl -L -C - --header "${ACCEPT_HEADER}" \ + -o ${OLLAMA_MODELS}/blobs/${LAYER} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} +done \ No newline at end of file diff --git a/server/llm_test.go b/server/llm_test.go new file mode 100644 index 00000000..167c5831 --- /dev/null +++ b/server/llm_test.go @@ -0,0 +1,103 @@ +package server + +import ( + "context" + "strings" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + "github.com/jmorganca/ollama/api" +) + +// TODO - this would ideally be in the llm 
package, but that would require some refactoring of interfaces in the server +// package to avoid circular dependencies + +// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server) +// +// TODO - Fix this ^^ + +var ( + req = [2]api.GenerateRequest{ + { + Model: "orca-mini", + Prompt: "tell me a short story about agi?", + Options: map[string]interface{}{}, + }, { + Model: "orca-mini", + Prompt: "what is the origin of the us thanksgiving holiday?", + Options: map[string]interface{}{}, + }, + } + resp = [2]string{ + "once upon a time", + "fourth thursday", + } +) + +func TestIntegrationSimpleOrcaMini(t *testing.T) { + SkipIFNoTestData(t) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) + defer cancel() + opts := api.DefaultOptions() + opts.Seed = 42 + opts.Temperature = 0.0 + model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts) + defer llmRunner.Close() + response := OneShotPromptResponse(t, ctx, req[0], model, llmRunner) + assert.Contains(t, strings.ToLower(response), resp[0]) +} + +// TODO +// The server always loads a new runner and closes the old one, which forces serial execution +// At present this test case fails with concurrency problems. Eventually we should try to +// get true concurrency working with n_parallel support in the backend +func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { + SkipIFNoTestData(t) + t.Skip("concurrent prediction on single runner not currently supported") + ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) + defer cancel() + opts := api.DefaultOptions() + opts.Seed = 42 + opts.Temperature = 0.0 + var wg sync.WaitGroup + wg.Add(len(req)) + model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts) + defer llmRunner.Close() + for i := 0; i < len(req); i++ { + go func(i int) { + defer wg.Done() + response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner) + t.Logf("Prompt: %s\nResponse: %s", req[0].Prompt, response) + assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt) + }(i) + } + wg.Wait() +} + +func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { + SkipIFNoTestData(t) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) + defer cancel() + opts := api.DefaultOptions() + opts.Seed = 42 + opts.Temperature = 0.0 + var wg sync.WaitGroup + wg.Add(len(req)) + + for i := 0; i < len(req); i++ { + go func(i int) { + defer wg.Done() + model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts) + defer llmRunner.Close() + response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner) + t.Logf("Prompt: %s\nResponse: %s", req[0].Prompt, response) + assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt) + }(i) + } + wg.Wait() +} + +// TODO - create a parallel test with 2 different models once we support concurrency diff --git a/server/llm_utils_test.go b/server/llm_utils_test.go new file mode 100644 index 00000000..592fac25 --- /dev/null +++ b/server/llm_utils_test.go @@ -0,0 +1,76 @@ +package server + +import ( + "context" + "errors" + "os" + "path" + "runtime" + "testing" + "time" + + "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/llm" + "github.com/stretchr/testify/require" +) + +func SkipIFNoTestData(t *testing.T) { + modelDir := getModelDir() + if _, err := os.Stat(modelDir); errors.Is(err, os.ErrNotExist) { + t.Skipf("%s does not exist - skipping integration tests", modelDir) + } 
+} + +func getModelDir() string { + _, filename, _, _ := runtime.Caller(0) + return path.Dir(path.Dir(filename) + "/../test_data/models/.") +} + +func PrepareModelForPrompts(t *testing.T, modelName string, opts api.Options) (*Model, llm.LLM) { + modelDir := getModelDir() + os.Setenv("OLLAMA_MODELS", modelDir) + model, err := GetModel(modelName) + require.NoError(t, err, "GetModel ") + err = opts.FromMap(model.Options) + require.NoError(t, err, "opts from model ") + runner, err := llm.New("unused", model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts) + require.NoError(t, err, "llm.New failed") + return model, runner +} + +func OneShotPromptResponse(t *testing.T, ctx context.Context, req api.GenerateRequest, model *Model, runner llm.LLM) string { + checkpointStart := time.Now() + prompt, err := model.Prompt(PromptVars{ + System: req.System, + Prompt: req.Prompt, + First: len(req.Context) == 0, + }) + require.NoError(t, err, "prompt generation failed") + success := make(chan bool, 1) + response := "" + cb := func(r llm.PredictResult) { + + if !r.Done { + response += r.Content + } else { + success <- true + } + } + checkpointLoaded := time.Now() + predictReq := llm.PredictOpts{ + Prompt: prompt, + Format: req.Format, + CheckpointStart: checkpointStart, + CheckpointLoaded: checkpointLoaded, + } + err = runner.Predict(ctx, predictReq, cb) + require.NoError(t, err, "predict call failed") + + select { + case <-ctx.Done(): + t.Errorf("failed to complete before timeout: \n%s", response) + return "" + case <-success: + return response + } +} diff --git a/server/routes.go b/server/routes.go index 12213606..26a02cc1 100644 --- a/server/routes.go +++ b/server/routes.go @@ -126,10 +126,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess loaded.Options = &opts } - // update options for the loaded llm - // TODO(mxyng): this isn't thread safe, but it should be fine for now - loaded.runner.SetOptions(opts) - loaded.expireAt = time.Now().Add(sessionDuration) if loaded.expireTimer == nil { From f8ef4439e9673c7df2314fafb5975aeab856c51f Mon Sep 17 00:00:00 2001 From: 65a <65a.invalid> Date: Mon, 16 Oct 2023 17:41:40 -0700 Subject: [PATCH 04/19] Use build tags to generate accelerated binaries for CUDA and ROCm on Linux. The build tags rocm or cuda must be specified to both go generate and go build. ROCm builds should have both ROCM_PATH set (and the ROCM SDK present) as well as CLBlast installed (for GGML) and CLBlast_DIR set in the environment to the CLBlast cmake directory (likely /usr/lib/cmake/CLBlast). Build tags are also used to switch VRAM detection between cuda and rocm implementations, using added "accelerator_foo.go" files which contain architecture specific functions and variables. accelerator_none is used when no tags are set, and a helper function addRunner will ignore it if it is the chosen accelerator. Fix go generate commands, thanks @deadmeu for testing. 
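
For illustration, typical invocations under this build-tag scheme look roughly like the commands below (the CLBlast and ROCm paths are examples and vary by distro; README.md as updated in this change has the canonical instructions):

    # NVIDIA CUDA: the tag must be passed to both generate and build
    go generate -tags cuda ./...
    go build -tags cuda .

    # AMD ROCm: ROCm SDK and CLBlast installed, with their locations exported
    CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./...
    go build -tags rocm

    # No tag: plain CPU-only build
    go generate ./...
    go build .

With no tag set, accelerator_none is compiled in and the default build stays CPU-only; accelerated binaries opt in explicitly via the tag.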
--- Dockerfile | 6 +- Dockerfile.build | 4 +- README.md | 35 +++++++++++- llm/accelerator_cuda.go | 67 ++++++++++++++++++++++ llm/accelerator_none.go | 21 +++++++ llm/accelerator_rocm.go | 85 ++++++++++++++++++++++++++++ llm/llama.cpp/generate_linux_cuda.go | 24 ++++++++ llm/llama.cpp/generate_linux_rocm.go | 25 ++++++++ 8 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 llm/accelerator_cuda.go create mode 100644 llm/accelerator_none.go create mode 100644 llm/accelerator_rocm.go create mode 100644 llm/llama.cpp/generate_linux_cuda.go create mode 100644 llm/llama.cpp/generate_linux_rocm.go diff --git a/Dockerfile b/Dockerfile index c50665b6..7c882852 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local totalBiggestCard { + totalBiggestCard = possible + bigCardName = record[0] + } + } + if totalBiggestCard == 0 { + log.Printf("found ROCm GPU but failed to parse free VRAM!") + return 0, errNoAccel + } + log.Printf("ROCm selecting device %q", bigCardName) + return totalBiggestCard, nil +} diff --git a/llm/llama.cpp/generate_linux_cuda.go b/llm/llama.cpp/generate_linux_cuda.go new file mode 100644 index 00000000..86a95977 --- /dev/null +++ b/llm/llama.cpp/generate_linux_cuda.go @@ -0,0 +1,24 @@ +//go:build cuda + +package llm + +//go:generate git submodule init + +//go:generate git submodule update --force ggml +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch +//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch +//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch + +//go:generate rm -rf ggml/build/cuda +//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate cmake --build ggml/build/cuda --target server --config Release +//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner + +//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch +//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch + +//go:generate rm -rf gguf/build/cuda +//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off +//go:generate cmake --build gguf/build/cuda --target server --config Release +//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner diff --git a/llm/llama.cpp/generate_linux_rocm.go b/llm/llama.cpp/generate_linux_rocm.go new file mode 100644 index 00000000..1766be84 --- /dev/null +++ b/llm/llama.cpp/generate_linux_rocm.go @@ -0,0 +1,25 @@ +//go:build rocm + +package llm + +//go:generate git submodule init + +//go:generate git submodule update --force ggml +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch +//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch +//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch + +//go:generate git submodule update --force gguf +//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch +//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch + +//go:generate rm -rf ggml/build/rocm 
+//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate cmake --build ggml/build/rocm --target server --config Release +//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner + +//go:generate rm -rf gguf/build/rocm +//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' +//go:generate cmake --build gguf/build/rocm --target server --config Release +//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner From 35934b2e05cd598a6de0a1ed1ef62c11fb078f36 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 29 Nov 2023 11:00:37 -0800 Subject: [PATCH 05/19] Adapted rocm support to cgo based llama.cpp --- Dockerfile | 6 +- Dockerfile.build | 52 ++- README.md | 20 +- gpu/gpu.go | 119 ++++++ {llm => gpu}/gpu_darwin.go | 19 +- gpu/gpu_info.h | 49 +++ gpu/gpu_info_cpu.c | 42 ++ gpu/gpu_info_cuda.c | 110 +++++ gpu/gpu_info_cuda.h | 35 ++ gpu/gpu_info_rocm.c | 111 +++++ gpu/gpu_info_rocm.h | 36 ++ gpu/gpu_test.go | 26 ++ gpu/types.go | 10 + llm/accelerator_cuda.go | 67 --- llm/accelerator_none.go | 21 - llm/accelerator_rocm.go | 85 ---- llm/ext_server.go | 374 +++++++++++------ llm/gpu_cuda.go | 57 --- llm/llama.cpp/gen_common.sh | 7 +- llm/llama.cpp/gen_darwin.sh | 3 +- llm/llama.cpp/gen_linux.sh | 66 ++- llm/llama.cpp/gen_windows.ps1 | 6 +- llm/llama.cpp/generate_linux.go | 2 +- llm/llama.cpp/generate_linux_cuda.go | 24 -- llm/llama.cpp/generate_linux_rocm.go | 25 -- .../0001-Expose-callable-API-for-server.patch | 397 ++++++++++-------- llm/llama.go | 10 +- llm/llm.go | 16 +- llm/rocm_shim.c | 134 ++++++ llm/rocm_shim.h | 73 ++++ llm/shim_darwin.go | 18 + llm/shim_ext_server.go | 212 ++++++++++ scripts/build_linux.sh | 2 +- scripts/build_remote.py | 68 +++ scripts/setup_integration_tests.sh | 2 +- server/llm_test.go | 20 +- server/routes.go | 22 +- 37 files changed, 1688 insertions(+), 658 deletions(-) create mode 100644 gpu/gpu.go rename {llm => gpu}/gpu_darwin.go (60%) create mode 100644 gpu/gpu_info.h create mode 100644 gpu/gpu_info_cpu.c create mode 100644 gpu/gpu_info_cuda.c create mode 100644 gpu/gpu_info_cuda.h create mode 100644 gpu/gpu_info_rocm.c create mode 100644 gpu/gpu_info_rocm.h create mode 100644 gpu/gpu_test.go create mode 100644 gpu/types.go delete mode 100644 llm/accelerator_cuda.go delete mode 100644 llm/accelerator_none.go delete mode 100644 llm/accelerator_rocm.go delete mode 100644 llm/gpu_cuda.go delete mode 100644 llm/llama.cpp/generate_linux_cuda.go delete mode 100644 llm/llama.cpp/generate_linux_rocm.go create mode 100644 llm/rocm_shim.c create mode 100644 llm/rocm_shim.h create mode 100644 llm/shim_darwin.go create mode 100644 llm/shim_ext_server.go create mode 100755 scripts/build_remote.py diff --git a/Dockerfile b/Dockerfile index 7c882852..c50665b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local /etc/apt/keyrings/rocm.gpg && \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] 
https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \ + echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \ + echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \ + echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev -# centos8 arm64 dependencies -FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64 -RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* -RUN yum install -y git cmake +ENV ROCM_PATH=/opt/rocm + +# Ubuntu 22.04 arm64 dependencies +FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64 +RUN apt-get update && \ + apt-get install -y wget && \ + wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \ + chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr FROM base-${TARGETARCH} ARG TARGETARCH ARG GOFLAGS="'-ldflags -w -s'" +ARG CGO_CFLAGS +ARG CLBLAST_VER=1.6.1 + +# Common toolchain +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11 + +# CLBlast +RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \ + cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install # install go ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz @@ -26,6 +51,7 @@ COPY . . ENV GOOS=linux ENV GOARCH=$TARGETARCH ENV GOFLAGS=$GOFLAGS +ENV CGO_CFLAGS=${CGO_CFLAGS} -RUN /usr/local/go/bin/go generate -tags cuda ./... && \ - /usr/local/go/bin/go build -tags cuda . +RUN /usr/local/go/bin/go generate ./... && \ + /usr/local/go/bin/go build . diff --git a/README.md b/README.md index 923290d5..84f94089 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,6 @@ ollama list ## Building -### Generic (CPU) - Install `cmake` and `go`: ``` @@ -202,32 +200,36 @@ Then build the binary: go build . ``` -### CUDA (NVIDIA) +### Linux/Windows CUDA (NVIDIA) *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* -Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages. +Note: at present, Ollama is optimized for GPU usage on linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU. + +Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages. Then generate dependencies: ``` -go generate -tags cuda ./... +go generate ./... ``` Then build the binary: ``` -go build -tags cuda . +go build . ``` -### ROCm (AMD) +### Linux ROCm (AMD) *Your operating system distribution may already have packages for AMD ROCm and CLBlast. 
Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`. Adjust the paths below (correct for Arch) as appropriate for your distributions install locations and generate dependencies: ``` -CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./... +CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./... ``` Then build the binary: ``` -go build -tags rocm +go build . ``` +ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root. + ### Running local builds Next, start the server: diff --git a/gpu/gpu.go b/gpu/gpu.go new file mode 100644 index 00000000..146c711e --- /dev/null +++ b/gpu/gpu.go @@ -0,0 +1,119 @@ +//go:build linux || windows + +package gpu + +/* +#include "gpu_info.h" + +*/ +import "C" +import ( + "fmt" + "log" + "sync" + "unsafe" + + "github.com/jmorganca/ollama/api" +) + +type handles struct { + cuda *C.cuda_handle_t + rocm *C.rocm_handle_t +} + +var gpuMutex sync.Mutex +var gpuHandles *handles = nil + +// Note: gpuMutex must already be held +func initGPUHandles() { + log.Printf("Detecting GPU type") + gpuHandles = &handles{nil, nil} + var resp C.cuda_init_resp_t + C.cuda_init(&resp) + if resp.err != nil { + log.Printf("CUDA not detected: %s", C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + + var resp C.rocm_init_resp_t + C.rocm_init(&resp) + if resp.err != nil { + log.Printf("ROCm not detected: %s", C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + } else { + log.Printf("Radeon GPU detected") + rocm := resp.rh + gpuHandles.rocm = &rocm + } + } else { + log.Printf("Nvidia GPU detected") + cuda := resp.ch + gpuHandles.cuda = &cuda + } +} + +func GetGPUInfo() GpuInfo { + // TODO - consider exploring lspci (and equivalent on windows) to check for + // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries + gpuMutex.Lock() + defer gpuMutex.Unlock() + if gpuHandles == nil { + initGPUHandles() + } + + var memInfo C.mem_info_t + var resp GpuInfo + if gpuHandles.cuda != nil { + C.cuda_check_vram(*gpuHandles.cuda, &memInfo) + resp.Driver = "CUDA" + } else if gpuHandles.rocm != nil { + C.rocm_check_vram(*gpuHandles.rocm, &memInfo) + resp.Driver = "ROCM" + } else { + C.cpu_check_ram(&memInfo) + resp.Driver = "CPU" + } + if memInfo.err != nil { + log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err)) + C.free(unsafe.Pointer(memInfo.err)) + } + resp.FreeMemory = uint64(memInfo.free) + resp.TotalMemory = uint64(memInfo.total) + return resp +} + +func CheckVRAM() (int64, error) { + gpuInfo := GetGPUInfo() + if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" { + return int64(gpuInfo.FreeMemory), nil + } + return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation +} + +func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { + if opts.NumGPU != -1 { + return opts.NumGPU + } + info := GetGPUInfo() + if info.Driver == "CPU" { + return 0 + } + + /* + Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. 
+ We can store the model weights and the kv cache in vram, + to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. + */ + bytesPerLayer := uint64(fileSizeBytes / numLayer) + + // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors + layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4 + + // TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU + // if int64(layers) < numLayer { + // log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024)) + // return 0 + // } + log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer) + + return layers +} diff --git a/llm/gpu_darwin.go b/gpu/gpu_darwin.go similarity index 60% rename from llm/gpu_darwin.go rename to gpu/gpu_darwin.go index 39ee4f75..e4a9456a 100644 --- a/llm/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -1,7 +1,8 @@ //go:build darwin -package llm +package gpu +import "C" import ( "github.com/jmorganca/ollama/api" ) @@ -9,11 +10,25 @@ import ( // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs func CheckVRAM() (int64, error) { // TODO - assume metal, and return free memory? - return 0, errNvidiaSMI + return 0, nil } +func GetGPUInfo() GpuInfo { + // TODO - Metal vs. x86 macs... + + return GpuInfo{ + Driver: "METAL", + TotalMemory: 0, + FreeMemory: 0, + } +} + func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { // default to enable metal on macOS return 1 } + +func nativeInit() error { + return nil +} diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h new file mode 100644 index 00000000..7de36465 --- /dev/null +++ b/gpu/gpu_info.h @@ -0,0 +1,49 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_H__ +#define __GPU_INFO_H__ +#include +#include +#include + +#ifndef _WIN32 +#include +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) +#define LOAD_ERR() dlerror() +#define UNLOAD_LIBRARY(handle) dlclose(handle) +#else +#include +#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) +#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) +#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) + +// TODO - refactor this with proper error message handling on windows +inline static char *LOAD_ERR() { + static char errbuf[8]; + snprintf(errbuf, 8, "0x%lx", GetLastError()); + return errbuf; +} + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mem_info { + uint64_t total; + uint64_t free; + char *err; // If non-nill, caller responsible for freeing +} mem_info_t; + +void cpu_check_ram(mem_info_t *resp); + +#ifdef __cplusplus +} +#endif + +#include "gpu_info_cuda.h" +#include "gpu_info_rocm.h" + +#endif // __GPU_INFO_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c new file mode 100644 index 00000000..a7987cd4 --- /dev/null +++ b/gpu/gpu_info_cpu.c @@ -0,0 +1,42 @@ +#include "gpu_info.h" +// Fallbacks for CPU mode + +#ifdef _WIN32 +#include +void cpu_check_ram(mem_info_t *resp) { + resp->err = NULL; + MEMORYSTATUSEX info; + if (GlobalMemoryStatusEx(&info) != 0) { + resp->total = info.ullTotalPhys; + resp->free = info.ullAvailPhys; + } else { + resp->err = strdup(LOAD_ERR()); + } + return; +} + +#elif __linux__ +#include +#include +#include +void cpu_check_ram(mem_info_t 
*resp) { + struct sysinfo info; + resp->err = NULL; + if (sysinfo(&info) != 0) { + resp->err = strdup(strerror(errno)); + } else { + resp->total = info.totalram * info.mem_unit; + resp->free = info.freeram * info.mem_unit; + } + return; +} + +#elif __APPLE__ +// TODO consider an Apple implementation that does something useful +// mem_info_t cpu_check_ram() { +// mem_info_t resp = {0, 0, NULL}; +// return resp; +// } +#else +#error "Unsupported platform" +#endif diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c new file mode 100644 index 00000000..0b2ac867 --- /dev/null +++ b/gpu/gpu_info_cuda.c @@ -0,0 +1,110 @@ +#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs? + +#include "gpu_info_cuda.h" + +#include + +#ifndef _WIN32 +const char *cuda_lib_paths[] = { + "libnvidia-ml.so", + "/usr/local/cuda/lib64/libnvidia-ml.so", + NULL, +}; +#else +const char *cuda_lib_paths[] = { + "nvml.dll", + "", + NULL, +}; +#endif + +void cuda_init(cuda_init_resp_t *resp) { + resp->err = NULL; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + struct lookup { + char *s; + void **p; + } l[4] = { + {"nvmlInit_v2", (void *)&resp->ch.initFn}, + {"nvmlShutdown", (void *)&resp->ch.shutdownFn}, + {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, + {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, + }; + + for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) { + resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY); + } + if (!resp->ch.handle) { + snprintf(buf, buflen, + "Unable to load %s library to query for Nvidia GPUs: %s", + cuda_lib_paths[0], LOAD_ERR()); + resp->err = strdup(buf); + return; + } + + for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list + *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); + if (!l[i].p) { + UNLOAD_LIBRARY(resp->ch.handle); + resp->ch.handle = NULL; + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, + LOAD_ERR()); + resp->err = strdup(buf); + return; + } + } + return; +} + +void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { + resp->err = NULL; + nvmlDevice_t device; + nvmlMemory_t memInfo = {0}; + nvmlReturn_t ret; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + if (h.handle == NULL) { + resp->err = strdup("nvml handle sn't initialized"); + return; + } + + ret = (*h.initFn)(); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "nvml vram init failure: %d", ret); + resp->err = strdup(buf); + return; + } + + // TODO - handle multiple GPUs + ret = (*h.getHandle)(0, &device); + if (ret != NVML_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "unable to get device handle: %d", ret); + resp->err = strdup(buf); + return; + } + + ret = (*h.getMemInfo)(device, &memInfo); + if (ret != NVML_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "device memory info lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + resp->total = memInfo.total; + resp->free = memInfo.free; + + ret = (*h.shutdownFn)(); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret); + resp->err = strdup(buf); + } + + return; +} +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h new file mode 100644 index 00000000..7d13cb6a --- /dev/null +++ b/gpu/gpu_info_cuda.h @@ -0,0 +1,35 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_CUDA_H__ +#define __GPU_INFO_CUDA_H__ +#include "gpu_info.h" + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum 
nvmlReturn_enum { + NVML_SUCCESS = 0, + // Other values omitted for now... +} nvmlReturn_t; +typedef void *nvmlDevice_t; // Opaque is sufficient +typedef struct nvmlMemory_st { + unsigned long long total; + unsigned long long free; + unsigned long long used; +} nvmlMemory_t; + +typedef struct cuda_handle { + void *handle; + nvmlReturn_t (*initFn)(void); + nvmlReturn_t (*shutdownFn)(void); + nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); + nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); +} cuda_handle_t; + +typedef struct cuda_init_resp { + char *err; // If err is non-null handle is invalid + cuda_handle_t ch; +} cuda_init_resp_t; + +void cuda_init(cuda_init_resp_t *resp); +void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp); + +#endif // __GPU_INFO_CUDA_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c new file mode 100644 index 00000000..88bd2dad --- /dev/null +++ b/gpu/gpu_info_rocm.c @@ -0,0 +1,111 @@ +#ifndef __APPLE__ + +#include "gpu_info_rocm.h" + +#include + +#ifndef _WIN32 +const char *rocm_lib_paths[] = { + "librocm_smi64.so", + "/opt/rocm/lib/librocm_smi64.so", + NULL, +}; +#else +// TODO untested +const char *rocm_lib_paths[] = { + "rocm_smi64.dll", + "/opt/rocm/lib/rocm_smi64.dll", + NULL, +}; +#endif + +void rocm_init(rocm_init_resp_t *resp) { + resp->err = NULL; + const int buflen = 256; + char buf[buflen + 1]; + int i; + struct lookup { + char *s; + void **p; + } l[4] = { + {"rsmi_init", (void *)&resp->rh.initFn}, + {"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, + {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, + {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, + // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, + }; + + for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) { + resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY); + } + if (!resp->rh.handle) { + snprintf(buf, buflen, + "Unable to load %s library to query for Radeon GPUs: %s\n", + rocm_lib_paths[0], LOAD_ERR()); + resp->err = strdup(buf); + return; + } + + for (i = 0; i < 4; i++) { + *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); + if (!l[i].p) { + UNLOAD_LIBRARY(resp->rh.handle); + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, + LOAD_ERR()); + resp->err = strdup(buf); + return; + } + } + return; +} + +void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { + resp->err = NULL; + // uint32_t num_devices; + // uint16_t device; + uint64_t totalMem = 0; + uint64_t usedMem = 0; + rsmi_status_t ret; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + ret = (*h.initFn)(0); + if (ret != RSMI_STATUS_SUCCESS) { + snprintf(buf, buflen, "rocm vram init failure: %d", ret); + resp->err = strdup(buf); + return; + } + + // TODO - iterate through devices... 
ret = + // rsmi_num_monitor_devices(&num_devices); + + // ret = (*h.getHandle)(0, &device); + // if (ret != RSMI_STATUS_SUCCESS) { + // printf("rocm vram device lookup failure: %d\n", ret); + // return -1; + // } + + // Get total memory - used memory for available memory + ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem); + if (ret != RSMI_STATUS_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem); + if (ret != RSMI_STATUS_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + + (*h.shutdownFn)(); + resp->total = totalMem; + resp->free = totalMem - usedMem; + return; +} + +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h new file mode 100644 index 00000000..8d7a04ae --- /dev/null +++ b/gpu/gpu_info_rocm.h @@ -0,0 +1,36 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_ROCM_H__ +#define __GPU_INFO_ROCM_H__ +#include "gpu_info.h" + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum rsmi_status_return { + RSMI_STATUS_SUCCESS = 0, + // Other values omitted for now... +} rsmi_status_t; + +typedef enum rsmi_memory_type { + RSMI_MEM_TYPE_VRAM = 0, + RSMI_MEM_TYPE_VIS_VRAM, + RSMI_MEM_TYPE_GTT, +} rsmi_memory_type_t; + +typedef struct rocm_handle { + void *handle; + rsmi_status_t (*initFn)(uint64_t); + rsmi_status_t (*shutdownFn)(void); + rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + // rsmi_status_t (*getHandle)(uint32_t, uint16_t *); +} rocm_handle_t; + +typedef struct rocm_init_resp { + char *err; // If err is non-null handle is invalid + rocm_handle_t rh; +} rocm_init_resp_t; + +void rocm_init(rocm_init_resp_t *resp); +void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); + +#endif // __GPU_INFO_ROCM_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go new file mode 100644 index 00000000..cbdcf3ec --- /dev/null +++ b/gpu/gpu_test.go @@ -0,0 +1,26 @@ +package gpu + +import ( + "runtime" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestBasicGetGPUInfo(t *testing.T) { + info := GetGPUInfo() + assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver) + + switch runtime.GOOS { + case "darwin": + // TODO - remove this once MacOS returns some size for CPU + return + case "linux", "windows": + assert.Greater(t, info.TotalMemory, uint64(0)) + assert.Greater(t, info.FreeMemory, uint64(0)) + default: + return + } +} + +// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected diff --git a/gpu/types.go b/gpu/types.go new file mode 100644 index 00000000..a84a0a8d --- /dev/null +++ b/gpu/types.go @@ -0,0 +1,10 @@ +package gpu + +// Beginning of an `ollama info` command +type GpuInfo struct { + Driver string `json:"driver,omitempty"` + TotalMemory uint64 `json:"total_memory,omitempty"` + FreeMemory uint64 `json:"free_memory,omitempty"` + + // TODO add other useful attributes about the card here for discovery information +} diff --git a/llm/accelerator_cuda.go b/llm/accelerator_cuda.go deleted file mode 100644 index f21d6d62..00000000 --- a/llm/accelerator_cuda.go +++ /dev/null @@ -1,67 +0,0 @@ -//go:build cuda - -package llm - -import ( - "bufio" - "bytes" - "errors" - "fmt" - 
"log" - "os/exec" - "path" - "strconv" - "strings" - - "github.com/jmorganca/ollama/format" -) - -var ( - errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") - errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") -) - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return []ModelRunner{ - ModelRunner{ - Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), - Accelerated: true, - }, - } -} - -// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs -func CheckVRAM() (int64, error) { - cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits") - var stdout bytes.Buffer - cmd.Stdout = &stdout - err := cmd.Run() - if err != nil { - return 0, errNoAccel - } - - var freeMiB int64 - scanner := bufio.NewScanner(&stdout) - for scanner.Scan() { - line := scanner.Text() - if strings.Contains(line, "[Insufficient Permissions]") { - return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi") - } - - vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64) - if err != nil { - return 0, fmt.Errorf("failed to parse available VRAM: %v", err) - } - - freeMiB += vram - } - - freeBytes := freeMiB * 1024 * 1024 - if freeBytes < 2*format.GigaByte { - log.Printf("less than 2 GB VRAM available") - return 0, errAvailableVRAM - } - - return freeBytes, nil -} diff --git a/llm/accelerator_none.go b/llm/accelerator_none.go deleted file mode 100644 index 442d884a..00000000 --- a/llm/accelerator_none.go +++ /dev/null @@ -1,21 +0,0 @@ -//go:build !rocm && !cuda - -package llm - -import ( - "errors" -) - -var ( - errNoAccel = errors.New("no accelerator support in this binary") -) - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return make([]ModelRunner, 0, 1) -} - -// CheckVRAM is a stub with no accelerator. -func CheckVRAM() (int64, error) { - return 0, errNoGPU -} diff --git a/llm/accelerator_rocm.go b/llm/accelerator_rocm.go deleted file mode 100644 index e71b4ea6..00000000 --- a/llm/accelerator_rocm.go +++ /dev/null @@ -1,85 +0,0 @@ -//go:build rocm - -package llm - -import ( - "bytes" - "encoding/csv" - "errors" - "fmt" - "io" - "log" - "os" - "os/exec" - "path" - "path/filepath" - "strconv" - "strings" -) - -var errNoAccel = errors.New("rocm-smi command failed") - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return []ModelRunner{ - ModelRunner{ - Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"), - Accelerated: true, - }, - } -} - -// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs -func CheckVRAM() (int64, error) { - rocmHome := os.Getenv("ROCM_PATH") - if rocmHome == "" { - rocmHome = os.Getenv("ROCM_HOME") - } - if rocmHome == "" { - log.Println("warning: ROCM_PATH is not set. 
Trying a likely fallback path, but it is recommended to set this variable in the environment.") - rocmHome = "/opt/rocm" - } - cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv") - var stdout bytes.Buffer - cmd.Stdout = &stdout - err := cmd.Run() - if err != nil { - return 0, errNoAccel - } - csvData := csv.NewReader(&stdout) - // llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME. - totalBiggestCard := int64(0) - bigCardName := "" - for { - record, err := csvData.Read() - if err == io.EOF { - break - } - if err != nil { - return 0, fmt.Errorf("failed to parse available VRAM: %v", err) - } - if !strings.HasPrefix(record[0], "card") { - continue - } - cardTotal, err := strconv.ParseInt(record[1], 10, 64) - if err != nil { - return 0, err - } - cardUsed, err := strconv.ParseInt(record[2], 10, 64) - if err != nil { - return 0, err - } - possible := (cardTotal - cardUsed) - log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0]) - if possible > totalBiggestCard { - totalBiggestCard = possible - bigCardName = record[0] - } - } - if totalBiggestCard == 0 { - log.Printf("found ROCm GPU but failed to parse free VRAM!") - return 0, errNoAccel - } - log.Printf("ROCm selecting device %q", bigCardName) - return totalBiggestCard, nil -} diff --git a/llm/ext_server.go b/llm/ext_server.go index 6e31dca7..bd026043 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -1,7 +1,7 @@ package llm /* -#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common +#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable @@ -25,6 +25,8 @@ package llm #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a + +// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a @@ -35,7 +37,7 @@ package llm #cgo windows LDFLAGS: -lext_server_shared -lpthread #include -#include "examples/server/server.h" +#include "server.h" */ import "C" @@ -43,25 +45,51 @@ import ( "bytes" "context" "encoding/json" - "errors" "fmt" "log" "os" "runtime" + "strings" "sync" "time" "unsafe" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/gpu" ) -func errWrap(resp C.ext_server_err) error { - if resp.code == 0 { - return nil +func newExtServerResp(len C.size_t) C.ext_server_resp_t { + var resp C.ext_server_resp_t + resp.msg_len = len + bytes := make([]byte, len) + resp.msg = (*C.char)(C.CBytes(bytes)) + return resp +} + +func freeExtServerResp(resp C.ext_server_resp_t) { + if resp.msg_len == 0 { + return } - err := fmt.Errorf(C.GoString(resp.err)) - C.free(unsafe.Pointer(resp.err)) - return err + C.free(unsafe.Pointer(resp.msg)) +} + +func 
extServerResponseToErr(resp C.ext_server_resp_t) error { + return fmt.Errorf(C.GoString(resp.msg)) +} + +type extServer interface { + LLM + llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) + llama_server_start() + llama_server_stop() + llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) + llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) + llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) + llama_server_release_task_result(result *C.ext_server_task_result_t) + llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_release_json_resp(json_resp **C.char) } type llamaExtServer struct { @@ -71,21 +99,61 @@ type llamaExtServer struct { // Note: current implementation does not support concurrent instantiations var mutex sync.Mutex -func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) { +func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { + C.llama_server_init(sparams, err) +} +func (llm *llamaExtServer) llama_server_start() { + C.llama_server_start() +} +func (llm *llamaExtServer) llama_server_stop() { + C.llama_server_stop() +} + +func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { + C.llama_server_completion(json_req, resp) +} +func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { + C.llama_server_completion_next_result(task_id, resp) +} +func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { + C.llama_server_completion_cancel(task_id, err) +} +func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { + C.llama_server_release_task_result(result) +} + +func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_tokenize(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_detokenize(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_embedding(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { + C.llama_server_release_json_resp(json_resp) +} + +func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + server := &llamaExtServer{opts} + return newExtServer(server, model, adapters, projectors, numLayers, opts) +} + +func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { if !mutex.TryLock() { log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } - server := &llamaExtServer{opts} fileInfo, err := os.Stat(model) if err != nil { return nil, err } - var sparams C.ext_server_params + var sparams C.ext_server_params_t sparams.model = C.CString(model) defer C.free(unsafe.Pointer(sparams.model)) - numGPU := NumGPU(numLayers, 
fileInfo.Size(), opts) + numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts) sparams.embedding = true sparams.n_ctx = C.uint(opts.NumCtx) @@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in // Always use the value encoded in the model sparams.rope_freq_base = 0.0 sparams.rope_freq_scale = 0.0 + sparams.memory_f16 = C.bool(opts.F16KV) + sparams.use_mlock = C.bool(opts.UseMLock) + sparams.use_mmap = C.bool(opts.UseMMap) + sparams.numa = C.bool(opts.UseNUMA) sparams.lora_adapters = nil for i := 0; i < len(adapters); i++ { - la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter)) + la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t)) defer C.free(unsafe.Pointer(la)) la.adapter = C.CString(adapters[i]) defer C.free(unsafe.Pointer(la.adapter)) @@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in } } - // TODO - implement ME - // if len(projectors) > 0 { - // // TODO: applying multiple projectors is not supported by the llama.cpp server yet - // params = append(params, "--mmproj", projectors[0]) - // } + if len(projectors) > 0 { + // TODO: applying multiple projectors is not supported by the llama.cpp server yet + sparams.mmproj = C.CString(projectors[0]) + defer C.free(unsafe.Pointer(sparams.mmproj)) + } else { + sparams.mmproj = nil + } if opts.NumThread > 0 { sparams.n_threads = C.uint(opts.NumThread) @@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in sparams.n_threads = C.uint(runtime.NumCPU()) } - sparams.memory_f16 = false - if opts.F16KV { - sparams.memory_f16 = true - } - sparams.use_mlock = false - if opts.UseMLock { - sparams.use_mlock = true - } - sparams.use_mmap = true - if !opts.UseMMap { - sparams.use_mmap = false - } - sparams.numa = false - if opts.UseNUMA { - sparams.numa = true - } - log.Printf("Initializing internal llama server") - err = errWrap(C.llama_server_init(&sparams)) - if err != nil { - return nil, err + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + server.llama_server_init(&sparams, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } log.Printf("Starting internal llama main loop") - C.llama_server_start() + server.llama_server_start() return server, nil } -func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { +func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { + return predict(llm, llm.Options, ctx, pred, fn) +} + +func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var imageData []ImageData + if len(predict.Images) > 0 { + for cnt, i := range predict.Images { + imageData = append(imageData, ImageData{Data: i, ID: cnt}) + } + } + log.Printf("loaded %d images", len(imageData)) request := map[string]any{ "prompt": predict.Prompt, "stream": true, - "n_predict": llm.NumPredict, - "n_keep": llm.NumKeep, - "temperature": llm.Temperature, - "top_k": llm.TopK, - "top_p": llm.TopP, - "tfs_z": llm.TFSZ, - "typical_p": llm.TypicalP, - "repeat_last_n": llm.RepeatLastN, - "repeat_penalty": llm.RepeatPenalty, - "presence_penalty": llm.PresencePenalty, - "frequency_penalty": llm.FrequencyPenalty, - "mirostat": llm.Mirostat, - "mirostat_tau": llm.MirostatTau, - "mirostat_eta": llm.MirostatEta, 
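The model, adapter, and projector paths just above are handed to the embedded server as C strings, each `C.CString` paired with a deferred `C.free`. As a stand-alone reminder of that ownership rule, here is a minimal sketch; the tiny C helper below is made up for the example and is not part of llama.cpp.

```
package main

/*
#include <stdlib.h>
#include <string.h>
static size_t c_len(const char *s) { return strlen(s); }
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// C.CString copies the Go string into C-owned memory; Go's garbage
	// collector will never reclaim it, so it must be freed explicitly.
	req := C.CString(`{"prompt": "hello"}`)
	defer C.free(unsafe.Pointer(req))

	fmt.Println("bytes visible to C:", C.c_len(req))
}
```

The same defer-based cleanup is what keeps the per-request allocations in this file from leaking.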
- "penalize_nl": llm.PenalizeNewline, - "seed": llm.Seed, - "stop": llm.Stop, + "n_predict": opts.NumPredict, + "n_keep": opts.NumKeep, + "temperature": opts.Temperature, + "top_k": opts.TopK, + "top_p": opts.TopP, + "tfs_z": opts.TFSZ, + "typical_p": opts.TypicalP, + "repeat_last_n": opts.RepeatLastN, + "repeat_penalty": opts.RepeatPenalty, + "presence_penalty": opts.PresencePenalty, + "frequency_penalty": opts.FrequencyPenalty, + "mirostat": opts.Mirostat, + "mirostat_tau": opts.MirostatTau, + "mirostat_eta": opts.MirostatEta, + "penalize_nl": opts.PenalizeNewline, + "seed": opts.Seed, + "stop": opts.Stop, + "image_data": imageData, } if predict.Format == "json" { request["grammar"] = jsonGrammar } - // Handling JSON marshaling with special characters unescaped. - buffer := &bytes.Buffer{} - enc := json.NewEncoder(buffer) - enc.SetEscapeHTML(false) + retryDelay := 100 * time.Microsecond + for retries := 0; retries < maxRetries; retries++ { + if retries > 0 { + time.Sleep(retryDelay) // wait before retrying + retryDelay *= 2 // exponential backoff + } - if err := enc.Encode(request); err != nil { - return fmt.Errorf("failed to marshal data: %w", err) - } + // Handling JSON marshaling with special characters unescaped. + buffer := &bytes.Buffer{} + enc := json.NewEncoder(buffer) + enc.SetEscapeHTML(false) - req := C.CString(buffer.String()) - defer C.free(unsafe.Pointer(req)) + if err := enc.Encode(request); err != nil { + return fmt.Errorf("failed to marshal data: %w", err) + } - cmpCtx := C.llama_server_completion(req) - if cmpCtx.task_id < 0 { - defer C.free(unsafe.Pointer(cmpCtx.err)) - return fmt.Errorf(C.GoString(cmpCtx.err)) - } + req := C.CString(buffer.String()) + defer C.free(unsafe.Pointer(req)) - for { - select { - case <-ctx.Done(): - // This handles the request cancellation - return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) - default: - result := C.llama_server_completion_next_result(cmpCtx.task_id) - if result.result_json != nil { - defer C.free(unsafe.Pointer(result.result_json)) - } - var p prediction - if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil { - err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) - return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2) - } + llm.llama_server_completion(req, &resp) + if resp.id < 0 { + return extServerResponseToErr(resp) + } - if p.Content != "" { - fn(PredictResult{ - // Model: predict.Model, // XXX remove or replace? - CreatedAt: time.Now().UTC(), - Content: p.Content, - }) - } + retryNeeded := false + out: + for { + select { + case <-ctx.Done(): + // This handles the request cancellation + llm.llama_server_completion_cancel(resp.id, &resp) + if resp.id < 0 { + return extServerResponseToErr(resp) + } else { + return nil + } + default: + var result C.ext_server_task_result_t + llm.llama_server_completion_next_result(resp.id, &result) + json_resp := C.GoString(result.json_resp) + llm.llama_server_release_task_result(&result) - if p.Stop { - fn(PredictResult{ - // Model: predict.Model, // XXX remove or replace? 
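The request above is encoded with a `json.Encoder` whose HTML escaping is turned off rather than with plain `json.Marshal`. A minimal, stand-alone illustration of why that matters for code-heavy prompts (nothing here is taken from the patch itself):

```
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

func main() {
	prompt := map[string]string{"prompt": "if a < b && b > c { return }"}

	// json.Marshal escapes <, > and & to \u003c, \u003e and \u0026 by default,
	// which would mangle prompts that contain code or markup.
	escaped, _ := json.Marshal(prompt)
	fmt.Println(string(escaped))

	// A json.Encoder with SetEscapeHTML(false) leaves those bytes untouched
	// (Encode also appends a trailing newline to the buffer).
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	_ = enc.Encode(prompt)
	fmt.Print(buf.String())
}
```

Running the sketch prints the escaped and unescaped encodings side by side, which is the difference the comment "special characters unescaped" is pointing at.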
- CreatedAt: time.Now().UTC(), - TotalDuration: time.Since(predict.CheckpointStart), - Done: true, - PromptEvalCount: p.Timings.PromptN, - PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), - EvalCount: p.Timings.PredictedN, - EvalDuration: parseDurationMs(p.Timings.PredictedMS), - }) - return nil + var p prediction + if err := json.Unmarshal([]byte(json_resp), &p); err != nil { + llm.llama_server_completion_cancel(resp.id, &resp) + if resp.id < 0 { + return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) + } else { + return fmt.Errorf("error unmarshaling llm prediction response: %w", err) + } + } + + if bool(result.error) && strings.Contains(json_resp, "slot unavailable") { + retryNeeded = true + // task will already be canceled + break out + } + + if p.Content != "" { + fn(PredictResult{ + Content: p.Content, + }) + } + + if p.Stop { + fn(PredictResult{ + Done: true, + PromptEvalCount: p.Timings.PromptN, + PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), + EvalCount: p.Timings.PredictedN, + EvalDuration: parseDurationMs(p.Timings.PredictedMS), + }) + return nil + } } } + if !retryNeeded { + return nil // success + } } + + // should never reach here ideally + return fmt.Errorf("max retries exceeded") +} +func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { + return encode(llm, ctx, prompt) } -func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { +func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { data, err := json.Marshal(TokenizeRequest{Content: prompt}) if err != nil { return nil, fmt.Errorf("marshaling encode data: %w", err) } req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_tokenize(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_tokenize(req, &json_resp, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var encoded TokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil { + if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { return nil, fmt.Errorf("unmarshal encode response: %w", err2) } @@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er } func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { + return decode(llm, ctx, tokens) +} + +func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { if len(tokens) == 0 { return "", nil } @@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_detokenize(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_detokenize(req, &json_resp, &resp) + if resp.id < 0 { + return "", extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var decoded DetokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil { + if err2 := 
json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { return "", fmt.Errorf("unmarshal encode response: %w", err2) } @@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er } func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { + return embedding(llm, ctx, input) +} +func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) { data, err := json.Marshal(TokenizeRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) @@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_embedding(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) - } - if err != nil { - return nil, err + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_embedding(req, &json_resp, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var embedding EmbeddingResponse - if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil { + if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } return embedding.Embedding, nil } -func (llm *llamaExtServer) Ping(ctx context.Context) error { - // TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state - return nil +func (llm *llamaExtServer) Close() { + close(llm) } -func (llm *llamaExtServer) Close() { - C.llama_server_stop() +func close(llm extServer) { + llm.llama_server_stop() mutex.Unlock() } diff --git a/llm/gpu_cuda.go b/llm/gpu_cuda.go deleted file mode 100644 index 0afa8e2b..00000000 --- a/llm/gpu_cuda.go +++ /dev/null @@ -1,57 +0,0 @@ -//go:build linux || windows - -package llm - -import ( - "errors" - "log" - - "github.com/jmorganca/ollama/api" -) - -/* -#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/" -#cgo linux LDFLAGS: -lnvidia-ml - -#include -#include "examples/server/server.h" -*/ -import "C" - -// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs -func CheckVRAM() (int64, error) { - return int64(C.check_vram()), nil -} - -func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { - if opts.NumGPU != -1 { - return opts.NumGPU - } - freeBytes, err := CheckVRAM() - if err != nil { - if !errors.Is(err, errNvidiaSMI) { - log.Print(err.Error()) - } - // nvidia driver not installed or no nvidia GPU found - return 0 - } - - /* - Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. - We can store the model weights and the kv cache in vram, - to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. - */ - bytesPerLayer := fileSizeBytes / numLayer - - // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors - layers := int(freeBytes/bytesPerLayer) * 3 / 4 - - // TODO - not sure on this part... 
if we can't fit all the layers, just fallback to CPU - // if int64(layers) < numLayer { - // log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024)) - // return 0 - // } - log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer) - - return layers -} diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh index f17d19de..2f75104f 100644 --- a/llm/llama.cpp/gen_common.sh +++ b/llm/llama.cpp/gen_common.sh @@ -1,10 +1,11 @@ # common logic accross linux and darwin init_vars() { + LLAMACPP_DIR=gguf PATCHES="0001-Expose-callable-API-for-server.patch" CMAKE_DEFS="-DLLAMA_ACCELERATE=on" # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings - CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server" + CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static" if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}" else @@ -29,6 +30,6 @@ apply_patches() { } build() { - cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS} - cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 + cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} + cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 } \ No newline at end of file diff --git a/llm/llama.cpp/gen_darwin.sh b/llm/llama.cpp/gen_darwin.sh index 448c595b..f159ceff 100755 --- a/llm/llama.cpp/gen_darwin.sh +++ b/llm/llama.cpp/gen_darwin.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # This script is intended to run inside the go generate # working directory must be ../llm/llama.cpp @@ -30,6 +30,7 @@ git_module_setup apply_patches build +# TODO - improve this to handle test cases that need it to be in "." 
around the tree # Enable local debug/run usecase if [ -e "gguf/ggml-metal.metal" ]; then cp gguf/ggml-metal.metal ../../ diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh index c5405dd8..93c998f4 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/llama.cpp/gen_linux.sh @@ -1,17 +1,73 @@ -#!/bin/sh +#!/bin/bash # This script is intended to run inside the go generate # working directory must be ../llm/llama.cpp set -ex set -o pipefail -# TODO - stopped here - map the variables from above over and refine the case statement below - echo "Starting linux generate script" +if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then + export CUDACXX=/usr/local/cuda/bin/nvcc +fi source $(dirname $0)/gen_common.sh init_vars -CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" -BUILD_DIR="gguf/build/cuda" git_module_setup apply_patches +CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +BUILD_DIR="gguf/build/cuda" +LIB_DIR="${BUILD_DIR}/lib" +mkdir -p ../../dist/ build +# TODO - explore mechanism to soften the hard cuda dependency on linux +# by conditionally building some archive here that aggregates the cuda libs if present +# so that the cgo flags link this intermediate archive instead of the underlying cuda libs +# +# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \ +# -Wl,--whole-archive \ +# ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \ +# ${BUILD_DIR}/common/libcommon.a \ +# ${BUILD_DIR}/libllama.a \ +# ${BUILD_DIR}/examples/llava/libllava_static.a \ +# -Wl,--no-whole-archive \ +# -lrt -lpthread -ldl -lstdc++ -lm \ +# /usr/local/cuda/lib64/libcudart_static.a \ +# /usr/local/cuda/lib64/libcublas_static.a \ +# /usr/local/cuda/lib64/libcublasLt_static.a \ +# /usr/local/cuda/lib64/libcudadevrt.a \ +# /usr/local/cuda/lib64/libculibos.a + +if [ -z "${ROCM_PATH}" ] ; then + # Try the default location in case it exists + ROCM_PATH=/opt/rocm +fi + +if [ -z "${CLBlast_DIR}" ] ; then + # Try the default location in case it exists + if [ -d /usr/lib/cmake/CLBlast ]; then + export CLBlast_DIR=/usr/lib/cmake/CLBlast + fi +fi + +BUILD_DIR="gguf/build/rocm" +LIB_DIR="${BUILD_DIR}/lib" +mkdir -p ${LIB_DIR} +# Ensure we have at least one file present for the embed +touch ${LIB_DIR}/.generated + +if [ -d "${ROCM_PATH}" ] ; then + echo "Building ROCm" + init_vars + CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'" + CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + build + gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \ + -Wl,--whole-archive \ + ${BUILD_DIR}/examples/server/libext_server.a \ + ${BUILD_DIR}/common/libcommon.a \ + ${BUILD_DIR}/libllama.a \ + -Wl,--no-whole-archive \ + -lrt -lpthread -ldl -lstdc++ -lm \ + -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \ + 
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \ + -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu +fi diff --git a/llm/llama.cpp/gen_windows.ps1 b/llm/llama.cpp/gen_windows.ps1 index 9717b2e7..f85f1a45 100644 --- a/llm/llama.cpp/gen_windows.ps1 +++ b/llm/llama.cpp/gen_windows.ps1 @@ -48,4 +48,8 @@ init_vars git_module_setup apply_patches build -install \ No newline at end of file +install + +# TODO - implement ROCm support on windows +md gguf/build/winrocm/lib -ea 0 +echo $null >> gguf/build/winrocm/lib/.generated diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index 6782a614..119b5c27 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -1,3 +1,3 @@ package llm -//go:generate sh ./gen_linux.sh +//go:generate bash ./gen_linux.sh diff --git a/llm/llama.cpp/generate_linux_cuda.go b/llm/llama.cpp/generate_linux_cuda.go deleted file mode 100644 index 86a95977..00000000 --- a/llm/llama.cpp/generate_linux_cuda.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build cuda - -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch -//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch - -//go:generate rm -rf ggml/build/cuda -//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cuda --target server --config Release -//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner - -//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch - -//go:generate rm -rf gguf/build/cuda -//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -//go:generate cmake --build gguf/build/cuda --target server --config Release -//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner diff --git a/llm/llama.cpp/generate_linux_rocm.go b/llm/llama.cpp/generate_linux_rocm.go deleted file mode 100644 index 1766be84..00000000 --- a/llm/llama.cpp/generate_linux_rocm.go +++ /dev/null @@ -1,25 +0,0 @@ -//go:build rocm - -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch -//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch - -//go:generate rm -rf ggml/build/rocm -//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/rocm --target server --config Release 
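Both generate scripts above deliberately create an empty `.generated` placeholder in the library output directory ("Ensure we have at least one file present for the embed"). The reason is that a `go:embed` pattern which matches no files is a compile error, so the placeholder keeps the build working even when the optional ROCm library was not produced. The sketch below is a generic, hypothetical illustration of that failure mode; the real directive and path in the repository may differ, and the example assumes a non-empty `testdata/` directory sits next to the file.

```
package main

import (
	"embed"
	"fmt"
)

// Hypothetical directive for illustration only. If the embedded directory
// contained no matching files at all, `go build` would fail with
// "pattern ...: no matching files found" - the situation the placeholder
// file created by the generate scripts guards against.
//
//go:embed testdata/*
var generated embed.FS

func main() {
	entries, _ := generated.ReadDir("testdata")
	fmt.Println("embedded files:", len(entries))
}
```

If the directory were guaranteed to be non-empty the placeholder would be unnecessary, which is presumably why the scripts only need it for the optional ROCm output.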
-//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner - -//go:generate rm -rf gguf/build/rocm -//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -//go:generate cmake --build gguf/build/rocm --target server --config Release -//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 838347d5..623243d4 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,15 +1,15 @@ -From 64b3fbb150d12b3ca63ac2fb4e57bc46f41d2ccd Mon Sep 17 00:00:00 2001 +From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server This adds an extern "C" interface within the example server --- - examples/server/CMakeLists.txt | 24 ++++ - examples/server/server.cpp | 247 +++++++++++++++++++++++++++++++++ - examples/server/server.h | 83 +++++++++++ + examples/server/CMakeLists.txt | 24 +++ + examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++ + examples/server/server.h | 89 +++++++++++ ggml-cuda.cu | 1 + - 4 files changed, 355 insertions(+) + 4 files changed, 388 insertions(+) create mode 100644 examples/server/server.h diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 895f751..f939590 100644 +index d0cd8e1..5f5d4c5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -59,7 +59,7 @@ index 895f751..f939590 100644 #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error -@@ -2631,6 +2634,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con +@@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } } @@ -67,84 +67,84 @@ index 895f751..f939590 100644 int main(int argc, char **argv) { // own arguments required by this example -@@ -3065,3 +3069,246 @@ int main(int argc, char **argv) +@@ -3066,3 +3070,273 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } + +#else // LLAMA_SERVER_LIBRARY +// Expose the llama server as a callable extern "C" API -+llama_server_context llama; ++llama_server_context *llama = NULL; +std::atomic ext_server_running(false); +std::thread ext_server_thread; -+inline ext_server_err makeErr(uint32_t code, std::string msg) { -+ if (code == 0) { -+ return ext_server_err{0, NULL}; -+ } -+ const std::string::size_type size = msg.size(); -+ ext_server_err ret = { -+ code, -+ new char[size + 1], -+ }; -+ memcpy(ret.err, msg.c_str(), size + 1); -+ return ret; -+} + -+ext_server_err llama_server_init(ext_server_params *sparams) ++void 
llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) +{ -+ log_set_target(stdout); -+ gpt_params params; -+ params.n_ctx = sparams->n_ctx; -+ params.n_batch = sparams->n_batch; -+ params.n_threads = sparams->n_threads; -+ params.n_parallel = sparams->n_parallel; -+ params.rope_freq_base = sparams->rope_freq_base; -+ params.rope_freq_scale = sparams->rope_freq_scale; -+ -+ if (sparams->memory_f16) { -+ params.cache_type_k = "f16"; -+ params.cache_type_v = "f16"; -+ } else { -+ params.cache_type_k = "f32"; -+ params.cache_type_v = "f32"; -+ } -+ -+ params.n_gpu_layers = sparams->n_gpu_layers; -+ params.main_gpu = sparams->main_gpu; -+ params.use_mlock = sparams->use_mlock; -+ params.use_mmap = sparams->use_mmap; -+ params.numa = sparams->numa; -+ params.embedding = sparams->embedding; -+ if (sparams->model != NULL) { -+ params.model = sparams->model; -+ } -+ -+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) { -+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); -+ } -+ ++ assert(err != NULL && sparams != NULL); ++ err->id = 0; ++ err->msg[0] = '\0'; + try { ++ llama = new llama_server_context; ++ log_set_target(stdout); ++ gpt_params params; ++ params.n_ctx = sparams->n_ctx; ++ params.n_batch = sparams->n_batch; ++ params.n_threads = sparams->n_threads; ++ params.n_parallel = sparams->n_parallel; ++ params.rope_freq_base = sparams->rope_freq_base; ++ params.rope_freq_scale = sparams->rope_freq_scale; ++ ++ if (sparams->memory_f16) { ++ params.cache_type_k = "f16"; ++ params.cache_type_v = "f16"; ++ } else { ++ params.cache_type_k = "f32"; ++ params.cache_type_v = "f32"; ++ } ++ ++ params.n_gpu_layers = sparams->n_gpu_layers; ++ params.main_gpu = sparams->main_gpu; ++ params.use_mlock = sparams->use_mlock; ++ params.use_mmap = sparams->use_mmap; ++ params.numa = sparams->numa; ++ params.embedding = sparams->embedding; ++ if (sparams->model != NULL) { ++ params.model = sparams->model; ++ } ++ ++ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) { ++ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); ++ } ++ ++ if (sparams->mmproj != NULL) { ++ params.mmproj = std::string(sparams->mmproj); ++ } ++ + llama_backend_init(params.numa); + + // load the model -+ if (!llama.load_model(params)) ++ if (!llama->load_model(params)) + { + // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages + // and pass them back to the caller for better UX -+ return makeErr(1, "error loading model " + params.model); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str()); ++ return; + } + -+ llama.initialize(); ++ llama->initialize(); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception initializing llama server"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server"); + } -+ return makeErr(0, ""); +} + +void llama_server_start() +{ ++ assert(llama != NULL); + // TODO mutex to protect thread creation + ext_server_thread = std::thread([&]() + { @@ -154,7 +154,7 @@ index 895f751..f939590 100644 + ggml_time_init(); + while (ext_server_running.load()) + { -+ if (!llama.update_slots()) { ++ if (!llama->update_slots()) { + LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n"); + break; + } @@ -170,124 +170,150 @@ index 895f751..f939590 100644 +} + +void llama_server_stop() { ++ assert(llama != NULL); + // TODO - too verbose, remove once things are solid + LOG_TEE("requesting llama server shutdown\n"); + ext_server_running = false; + ext_server_thread.join(); ++ delete llama; ++ llama = NULL; + LOG_TEE("llama server shutdown complete\n"); +} + -+ext_server_completion_resp llama_server_completion(const char *json_req) { -+ std::string msg; -+ ext_server_completion_resp resp = { -+ 0, -+ NULL, -+ }; ++void llama_server_completion(const char *json_req, ext_server_resp_t *resp) { ++ assert(llama != NULL && json_req != NULL && resp != NULL); ++ resp->id = -1; ++ resp->msg[0] = '\0'; + try { + json data = json::parse(json_req); -+ resp.task_id = llama.request_completion(data, false, false, -1); -+ return resp; ++ resp->id = llama->request_completion(data, false, false, -1); + } catch (std::exception &e) { -+ msg = e.what(); ++ snprintf(resp->msg, resp->msg_len, "exception %s", e.what()); + } catch (...) { -+ msg = "Unknown Exception during completion"; ++ snprintf(resp->msg, resp->msg_len, "Unknown exception during completion"); + } -+ const std::string::size_type size = msg.size(); -+ resp.task_id = 0; -+ resp.err = new char[size + 1]; -+ memcpy(resp.err, msg.c_str(), size + 1); -+ return resp; +} + -+ext_task_result llama_server_completion_next_result(const int task_id) { ++void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) { ++ assert(llama != NULL && resp != NULL); + std::string msg; -+ ext_task_result resp = {-1,false,false,NULL}; -+ try { -+ task_result result = llama.next_result(task_id); -+ std::string result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); -+ const std::string::size_type size = result_json.size(); -+ resp.id = result.id; -+ resp.stop = result.stop; -+ resp.error = result.error; -+ resp.result_json = new char[size + 1]; -+ memcpy(resp.result_json, result_json.c_str(), size + 1); -+ if (result.error) { -+ llama.request_cancel(task_id); -+ } else if (result.stop) { -+ llama.request_cancel(task_id); -+ } -+ return resp; -+ } catch (std::exception &e) { -+ msg = e.what(); // TODO - json? -+ } catch (...) { -+ msg = "Unknown Exception during completion"; -+ } -+ resp.error = true; -+ const std::string::size_type size = msg.size(); -+ resp.result_json = new char[size + 1]; -+ memcpy(resp.result_json, msg.c_str(), size + 1); -+ return resp; -+} -+ -+ext_server_err llama_server_completion_cancel(const int task_id) { -+ try { -+ llama.request_cancel(task_id); -+ } catch (std::exception &e) { -+ return makeErr(1, e.what()); -+ } catch (...) 
{ -+ return makeErr(1, "Unknown Exception running llama server"); -+ } -+ return makeErr(0, ""); -+} -+ -+ -+ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp) { ++ resp->id = -1; ++ resp->stop = false; ++ resp->error = false; + resp->json_resp = NULL; ++ std::string result_json; ++ try { ++ task_result result = llama->next_result(task_id); ++ result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); ++ resp->id = result.id; ++ resp->stop = result.stop; ++ resp->error = result.error; ++ if (result.error) { ++ llama->request_cancel(task_id); ++ } else if (result.stop) { ++ llama->request_cancel(task_id); ++ } ++ } catch (std::exception &e) { ++ resp->error = true; ++ resp->id = -1; ++ result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}"; ++ } catch (...) { ++ resp->error = true; ++ resp->id = -1; ++ result_json = "{\"error\":\"Unknown exception during completion\"}"; ++ } ++ const std::string::size_type size = result_json.size() + 1; ++ resp->json_resp = new char[size]; ++ snprintf(resp->json_resp, size, "%s", result_json.c_str()); ++} ++ ++void llama_server_release_task_result(ext_server_task_result_t *result) { ++ if (result == NULL || result->json_resp == NULL) { ++ return; ++ } ++ delete[] result->json_resp; ++} ++ ++void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) { ++ assert(llama != NULL && err != NULL); ++ err->id = 0; ++ err->msg[0] = '\0'; ++ try { ++ llama->request_cancel(task_id); ++ } catch (std::exception &e) { ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); ++ } catch (...) { ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server"); ++ } ++} ++ ++void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + std::vector tokens; + if (body.count("content") != 0) + { -+ tokens = llama.tokenize(body["content"], false); ++ tokens = llama->tokenize(body["content"], false); + } + const json data = format_tokenizer_response(tokens); + std::string result_json = data.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception during tokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during tokenize"); + } -+ return makeErr(0, ""); +} + -+ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp) { -+ resp->json_resp = NULL; ++void llama_server_release_json_resp(char **json_resp) { ++ if (json_resp == NULL || *json_resp == NULL) { ++ return; ++ } ++ delete[] *json_resp; ++} ++ ++void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; -+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); ++ content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend()); + } + const json data = format_detokenized_response(content); + std::string result_json = data.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) { -+ return makeErr(1, "Unknown Exception during detokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during detokenize"); + } -+ return makeErr(0, ""); +} + -+ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp) { -+ resp->json_resp = NULL; ++void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + json prompt; @@ -299,28 +325,29 @@ index 895f751..f939590 100644 + { + prompt = ""; + } -+ const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); -+ task_result result = llama.next_result(task_id); ++ const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); ++ task_result result = llama->next_result(task_id); + std::string result_json = result.result_json.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception during detokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during embedding"); + } -+ return makeErr(0, ""); +} + +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/examples/server/server.h b/examples/server/server.h new file mode 100644 -index 0000000..4d03b1e +index 0000000..d22f1b6 --- /dev/null +++ b/examples/server/server.h -@@ -0,0 +1,83 @@ +@@ -0,0 +1,89 @@ +#if defined(LLAMA_SERVER_LIBRARY) +#ifndef LLAMA_SERVER_H +#define LLAMA_SERVER_H @@ -336,20 +363,23 @@ index 0000000..4d03b1e +extern "C" +{ +#endif -+ // TODO - clean the type def's up a bit for better consistency -+ typedef struct ext_server_err { -+ uint32_t code; // 0 on success, > 0 on error -+ char *err; // null if code == 0; else contains error message. Caller responsible for freeing memory -+ } ext_server_err; ++ typedef struct ext_server_resp { ++ int id; // < 0 on error ++ size_t msg_len; // caller must allocate msg and set msg_len ++ char *msg; ++ } ext_server_resp_t; + ++ // Allocated and freed by caller + typedef struct ext_server_lora_adapter { + char *adapter; + float scale; + struct ext_server_lora_adapter *next; -+ } ext_server_lora_adapter; ++ } ext_server_lora_adapter_t; ++ ++ // Allocated and freed by caller + typedef struct ext_server_params + { -+ char *model; ++ char *model; + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_threads; // number of threads to use for generation @@ -363,40 +393,43 @@ index 0000000..4d03b1e + bool use_mmap; // use mmap if possible + bool numa; // attempt optimizations that help on some NUMA systems + bool embedding; // get only sentence embedding -+ ext_server_lora_adapter* lora_adapters; -+ } ext_server_params; ++ ext_server_lora_adapter_t* lora_adapters; ++ char *mmproj; ++ } ext_server_params_t; + -+ // Initialize the server once per process -+ ext_server_err llama_server_init(ext_server_params *sparams); -+ -+ // Run the main loop -+ void llama_server_start(); -+ // Stop the main loop -+ void llama_server_stop(); -+ -+ typedef struct ext_task_result ++ typedef struct ext_server_task_result + { + int id; + bool stop; + bool error; -+ char* result_json; // caller responsible to free this memory -+ } ext_task_result; -+ -+ typedef struct ext_server_completion_resp { -+ int task_id; // < 0 on error, >= 0 on success -+ char *err; // null if task_id >= 0; else contains error message. 
Caller responsible for freeing memory -+ } ext_server_completion_resp; -+ ext_server_completion_resp llama_server_completion(const char *json_req); -+ ext_task_result llama_server_completion_next_result(const int task_id); -+ ext_server_err llama_server_completion_cancel(const int task_id); ++ char* json_resp; // null terminated, memory managed by ext_server ++ } ext_server_task_result_t; + -+ // Caller responsible for freeing json_resp -+ typedef struct ext_server_resp { -+ char *json_resp; // Caller responsible for freeing string -+ } ext_server_resp; -+ ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp); -+ ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp); -+ ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp); ++ // Initialize the server once per process ++ // err->id = 0 for success and err->msg[0] = NULL ++ // err->id != 0 for failure, and err->msg contains error message ++ void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err); ++ ++ // Run the main loop, called once per init ++ void llama_server_start(); ++ // Stop the main loop and free up resources allocated in init and start. Init must be called again to reuse ++ void llama_server_stop(); ++ ++ // json_req null terminated string, memory managed by caller ++ // resp->id >= 0 on success (task ID) ++ // resp->id < 0 on error, and resp->msg contains error message ++ void llama_server_completion(const char *json_req, ext_server_resp_t *resp); ++ ++ // Caller must call llama_server_release_task_result to free resp->json_resp ++ void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result); ++ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err); ++ void llama_server_release_task_result(ext_server_task_result_t *result); ++ ++ // Caller must call llama_server_releaes_json_resp to free json_resp if err.id < 0 ++ void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err); ++ void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err); ++ void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err); ++ void llama_server_release_json_resp(char **json_resp); + +#ifdef __cplusplus +} @@ -406,10 +439,10 @@ index 0000000..4d03b1e +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 85f7a29..ce51364 100644 +index 9e1acd3..ea64b55 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu -@@ -6410,6 +6410,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( +@@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; } else { diff --git a/llm/llama.go b/llm/llama.go index b3c57d47..26a0d588 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -3,6 +3,7 @@ package llm import ( "bytes" "context" + _ "embed" "errors" "fmt" "os" @@ -112,12 +113,6 @@ type ImageData struct { ID int `json:"id"` } -type llama struct { - api.Options - ImageData []ImageData - Running -} - var ( errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") @@ -166,7 +161,8 @@ type prediction struct { } const maxBufferSize = 512 * format.KiloByte -const maxRetries = 6 +const maxRetries = 3 +const retryDelay = 1 * time.Second type 
PredictOpts struct { Prompt string diff --git a/llm/llm.go b/llm/llm.go index 41724d35..86dd3346 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -11,6 +11,7 @@ import ( "github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/format" + "github.com/jmorganca/ollama/gpu" ) type LLM interface { @@ -19,7 +20,6 @@ type LLM interface { Encode(context.Context, string) ([]int, error) Decode(context.Context, []int) (string, error) Close() - Ping(context.Context) error } func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { @@ -78,5 +78,17 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.NumGQA = 0 opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + gpuInfo := gpu.GetGPUInfo() + switch gpuInfo.Driver { + case "ROCM": + return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + default: + // Rely on the built-in CUDA based server which will fall back to CPU + return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + } +} + +// Give any native cgo implementations an opportunity to initialize +func Init(workdir string) error { + return nativeInit(workdir) } diff --git a/llm/rocm_shim.c b/llm/rocm_shim.c new file mode 100644 index 00000000..9a6595b1 --- /dev/null +++ b/llm/rocm_shim.c @@ -0,0 +1,134 @@ +#include "rocm_shim.h" + +#include +#include + +#ifndef _WIN32 +#include +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) +#define LOAD_ERR() dlerror() +#define UNLOAD_LIBRARY(handle) dlclose(handle) +#else +#include +#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) +#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) +#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) +// TODO - refactor this with proper error message handling on windows +inline static char *LOAD_ERR() { + static char errbuf[8]; + snprintf(errbuf, 8, "0x%lx", GetLastError()); + return errbuf; +} +#endif + +void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, + ext_server_resp_t *err) { + int i = 0; + struct lookup { + char *s; + void **p; + } l[] = { + {"llama_server_init", (void *)&s->llama_server_init}, + {"llama_server_start", (void *)&s->llama_server_start}, + {"llama_server_stop", (void *)&s->llama_server_stop}, + {"llama_server_completion", (void *)&s->llama_server_completion}, + {"llama_server_completion_next_result", + (void *)&s->llama_server_completion_next_result}, + {"llama_server_completion_cancel", + (void *)&s->llama_server_completion_cancel}, + {"llama_server_release_task_result", + (void *)&s->llama_server_release_task_result}, + {"llama_server_tokenize", (void *)&s->llama_server_tokenize}, + {"llama_server_detokenize", (void *)&s->llama_server_detokenize}, + {"llama_server_embedding", (void *)&s->llama_server_embedding}, + {"llama_server_release_json_resp", + (void *)&s->llama_server_release_json_resp}, + {"", NULL}, + }; + + printf("Lazy loading %s library\n", libPath); + s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY); + if (!s->handle) { + err->id = -1; + snprintf( + err->msg, err->msg_len, + "Unable to load rocm server library: %s (If you have a Radeon card, " + "did you install the ROCM libraries?)", + LOAD_ERR()); + return; + } + + for (i = 0; l[i].p != NULL; i++) { + *l[i].p = LOAD_SYMBOL(s->handle, l[i].s); + if (!l[i].p) { + UNLOAD_LIBRARY(s->handle); + err->id = -1; + snprintf(err->msg, err->msg_len, "symbol lookup 
for %s failed: %s", + l[i].s, LOAD_ERR()); + return; + } + } +} + +inline void rocm_shim_llama_server_init(struct rocm_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err) { + s.llama_server_init(sparams, err); +} + +inline void rocm_shim_llama_server_start(struct rocm_llama_server s) { + s.llama_server_start(); +} + +inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) { + s.llama_server_stop(); +} + +inline void rocm_shim_llama_server_completion(struct rocm_llama_server s, + const char *json_req, + ext_server_resp_t *resp) { + s.llama_server_completion(json_req, resp); +} + +inline void rocm_shim_llama_server_completion_next_result( + struct rocm_llama_server s, const int task_id, + ext_server_task_result_t *result) { + s.llama_server_completion_next_result(task_id, result); +} + +inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, + const int task_id, + ext_server_resp_t *err) { + s.llama_server_completion_cancel(task_id, err); +} +inline void rocm_shim_llama_server_release_task_result( + struct rocm_llama_server s, ext_server_task_result_t *result) { + s.llama_server_release_task_result(result); +} + +inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_tokenize(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_detokenize(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_embedding(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, + char **json_resp) { + s.llama_server_release_json_resp(json_resp); +} diff --git a/llm/rocm_shim.h b/llm/rocm_shim.h new file mode 100644 index 00000000..d11ed991 --- /dev/null +++ b/llm/rocm_shim.h @@ -0,0 +1,73 @@ +#include + +#include "server.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct rocm_llama_server { + void *handle; + void (*llama_server_init)(ext_server_params_t *sparams, + ext_server_resp_t *err); + void (*llama_server_start)(); + void (*llama_server_stop)(); + void (*llama_server_completion)(const char *json_req, + ext_server_resp_t *resp); + void (*llama_server_completion_next_result)(const int task_id, + ext_server_task_result_t *result); + void (*llama_server_completion_cancel)(const int task_id, + ext_server_resp_t *err); + void (*llama_server_release_task_result)(ext_server_task_result_t *result); + void (*llama_server_tokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_detokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_embedding)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_release_json_resp)(char **json_resp); +}; + +void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, + ext_server_resp_t *err); + +// No good way to call C function pointers from Go so inline the indirection +void rocm_shim_llama_server_init(struct rocm_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err); + +void rocm_shim_llama_server_start(struct rocm_llama_server s); + +void rocm_shim_llama_server_stop(struct rocm_llama_server s); + +void 
rocm_shim_llama_server_completion(struct rocm_llama_server s, + const char *json_req, + ext_server_resp_t *resp); + +void rocm_shim_llama_server_completion_next_result( + struct rocm_llama_server s, const int task_id, + ext_server_task_result_t *result); + +void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, + const int task_id, + ext_server_resp_t *err); + +void rocm_shim_llama_server_release_task_result( + struct rocm_llama_server s, ext_server_task_result_t *result); + +void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); + +void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); + +void rocm_shim_llama_server_embedding(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); +void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, + char **json_resp); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go new file mode 100644 index 00000000..adf02108 --- /dev/null +++ b/llm/shim_darwin.go @@ -0,0 +1,18 @@ +package llm + +import ( + "fmt" + + "github.com/jmorganca/ollama/api" +) + +// no-op stubs for mac + +func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + // should never happen... + return nil, fmt.Errorf("ROCM GPUs not supported on Mac") +} + +func nativeInit(workDir string) error { + return nil +} diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go new file mode 100644 index 00000000..0e7bcfae --- /dev/null +++ b/llm/shim_ext_server.go @@ -0,0 +1,212 @@ +//go:build !darwin + +package llm + +/* + +#include +#include "rocm_shim.h" + +*/ +import "C" +import ( + "context" + "embed" + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "runtime" + "sync" + "unsafe" + + "github.com/jmorganca/ollama/api" +) + +//go:embed llama.cpp/gguf/build/*/lib/* +var libEmbed embed.FS + +var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. 
Radeon GPUs are not supported") +var NoShim = true + +type shimExtServer struct { + s C.struct_rocm_llama_server + options api.Options +} + +// Note: current implementation does not support concurrent instantiations +var shimMutex sync.Mutex +var llm *shimExtServer + +func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_init(llm.s, sparams, err) +} +func (llm *shimExtServer) llama_server_start() { + C.rocm_shim_llama_server_start(llm.s) +} +func (llm *shimExtServer) llama_server_stop() { + C.rocm_shim_llama_server_stop(llm.s) +} + +func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { + C.rocm_shim_llama_server_completion(llm.s, json_req, resp) +} +func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { + C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp) +} +func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err) +} +func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { + C.rocm_shim_llama_server_release_task_result(llm.s, result) +} + +func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { + C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp) +} + +func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + if NoShim { + return nil, RocmShimMissing + } + log.Printf("Loading ROCM llm server") + if llm == nil { + return nil, fmt.Errorf("nativeInit wasnt called or libary load failed") + } + llm.options = opts + return newExtServer(llm, model, adapters, projectors, numLayers, opts) +} + +func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { + return predict(llm, llm.options, ctx, pred, fn) +} + +func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { + return encode(llm, ctx, prompt) +} + +func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) { + return decode(llm, ctx, tokens) +} + +func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { + return embedding(llm, ctx, input) +} + +func (llm *shimExtServer) Close() { + close(llm) +} + +func nativeInit(workdir string) error { + err := extractLib(workdir) + if err != nil { + if err == RocmShimMissing { + log.Printf("%s", err) + return nil + } + return err + } + + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + log.Fatalf("Radeon card detected, but permissions not set up properly. 
Either run ollama as root, or add your user account to the render group.")
+      return err
+    } else if errors.Is(err, fs.ErrNotExist) {
+      // expected behavior without a radeon card
+      return nil
+    }
+
+    return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+  }
+  fd.Close()
+
+  shimMutex.Lock()
+  defer shimMutex.Unlock()
+  if llm != nil {
+    return nil
+  }
+  var libName string
+  switch runtime.GOOS {
+  case "darwin":
+    // shouldn't happen
+    return nil
+  case "linux":
+    libName = "librocm_server.so"
+  case "windows":
+    libName = "rocm_server.dll"
+  default:
+    // shouldn't happen
+    return nil
+  }
+  libPath := C.CString(filepath.Join(workdir, libName))
+  defer C.free(unsafe.Pointer(libPath))
+  resp := newExtServerResp(128)
+  defer freeExtServerResp(resp)
+  var srv C.struct_rocm_llama_server
+  C.rocm_shim_init(libPath, &srv, &resp)
+  if resp.id < 0 {
+    // TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
+    // and run against CPU
+    return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
+  }
+  llm = &shimExtServer{
+    s:       srv,
+    options: api.DefaultOptions(),
+  }
+  return nil
+}
+
+func extractLib(workDir string) error {
+  files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+  if err != nil || len(files) == 0 {
+    // this is expected, ollama may be compiled without shim library packed in
+    return RocmShimMissing
+  }
+
+  if len(files) != 1 {
+    // Shouldn't happen, but just use the first one we find
+    log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0])
+  }
+
+  srcFile, err := libEmbed.Open(files[0])
+  if err != nil {
+    return fmt.Errorf("read ROCm shim %s: %v", files[0], err)
+  }
+  defer srcFile.Close()
+  if err := os.MkdirAll(workDir, 0o755); err != nil {
+    return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err)
+  }
+
+  destFile := filepath.Join(workDir, filepath.Base(files[0]))
+
+  _, err = os.Stat(destFile)
+  switch {
+  case errors.Is(err, os.ErrNotExist):
+    destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+    if err != nil {
+      return fmt.Errorf("write ROCm shim %s: %v", files[0], err)
+    }
+    defer destFile.Close()
+    if _, err := io.Copy(destFile, srcFile); err != nil {
+      return fmt.Errorf("copy ROCm shim %s: %v", files[0], err)
+    }
+  case err != nil:
+    return fmt.Errorf("stat ROCm shim %s: %v", files[0], err)
+  }
+  NoShim = false
+  return nil
+}
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 20b44bf7..06a2ae1c 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
 mkdir -p dist
 for TARGETARCH in amd64 arm64; do
-    docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
+    docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . 
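[Editor's note] The rocm_shim.c / rocm_shim.h pair introduced above exists because cgo cannot invoke a C function pointer directly; each dlsym()'d entry point is therefore wrapped in a plain C helper that performs the indirect call ("No good way to call C function pointers from Go so inline the indirection"). Below is a minimal, self-contained sketch of that trick, independent of the actual shim; the names adder_t, lookup_adder and call_adder are made up for illustration only.

// indirection sketch: build with cgo enabled (go build .)
package main

/*
typedef int (*adder_t)(int, int);

static int add_impl(int a, int b) { return a + b; }

// Stands in for the pointer rocm_shim_init would obtain via dlsym()/LOAD_SYMBOL.
static adder_t lookup_adder(void) { return add_impl; }

// Go cannot call adder_t directly, so the indirect call is wrapped in plain C,
// just as the rocm_shim_llama_server_* wrappers do for the real entry points.
static int call_adder(adder_t fn, int a, int b) { return fn(a, b); }
*/
import "C"

import "fmt"

func main() {
	fn := C.lookup_adder()         // pretend this came from dlsym()
	sum := C.call_adder(fn, 2, 3)  // the "inline the indirection" step
	fmt.Println("2 + 3 =", int(sum))
}

In the patch itself, rocm_shim_init fills struct rocm_llama_server with the dlsym() results and the inline rocm_shim_llama_server_* functions play the role of call_adder here.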
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker rm builder-$TARGETARCH diff --git a/scripts/build_remote.py b/scripts/build_remote.py new file mode 100755 index 00000000..db824e4b --- /dev/null +++ b/scripts/build_remote.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import subprocess +import sys +from urllib.parse import urlparse +from git import Repo + +# Helper script to be able to build on remote repos using git to push local changes +# (e.g. particularly helpful to target a remote windows build system) +# +# Typical windows remote git config looks like this: +# +#[remote "windows-pa"] +# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama +# fetch = +refs/heads/*:refs/remotes/windows-pa/* +# uploadpack = powershell git upload-pack +# receivepack = powershell git receive-pack +# + +# TODO - add argpare and make this more configurable +# - force flag becomes optional +# - generate, build or test ... + +# Note: remote repo will need this run once: +# git config --local receive.denyCurrentBranch updateInstead +repo = Repo(".") + +# On linux, add links in /usr/local/bin to the go binaries to avoid needing this +# GoCmd = "/usr/local/go/bin/go" +GoCmd = "go" + +if repo.is_dirty(): + print("Tree is dirty. Commit your changes before running this script") + sys.exit(1) + +if len(sys.argv) != 2: + print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes])) + sys.exit(1) +remote_name = sys.argv[1] + +remote = {r.name: r for r in repo.remotes}[remote_name] +raw_url = list(remote.urls)[0] +url = urlparse(raw_url) +# Windows urls don't quite parse properly +if url.scheme == "" and url.netloc == "": + url = urlparse("ssh://" + raw_url) +print("URL: " + str(url)) +netloc = url.netloc.split(":")[0] +path = url.path +branch_name = repo.active_branch.name + +print("Force pushing content to remote...") +# Use with care given the force push +remote.push(force=True).raise_if_error() + +print("Ensuring correct branch checked out on remote via ssh...") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name]) + + +# TODO - add some hardening to try to figure out how to set up the path properly +# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env']) +# TODO - or consider paramiko maybe + +print("Performing generate") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...']) + +print("Building") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.']) + diff --git a/scripts/setup_integration_tests.sh b/scripts/setup_integration_tests.sh index a1d01ac1..a8651bc0 100755 --- a/scripts/setup_integration_tests.sh +++ b/scripts/setup_integration_tests.sh @@ -32,4 +32,4 @@ for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_M curl -L -C - --header "${ACCEPT_HEADER}" \ -o ${OLLAMA_MODELS}/blobs/${LAYER} \ ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} -done \ No newline at end of file +done diff --git a/server/llm_test.go b/server/llm_test.go index 167c5831..ad0823f6 100644 --- a/server/llm_test.go +++ b/server/llm_test.go @@ -2,14 +2,17 @@ package server import ( "context" + "os" "strings" "sync" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/llm" ) // TODO - this would ideally be in the llm package, but 
that would require some refactoring of interfaces in the server @@ -33,12 +36,16 @@ var ( } resp = [2]string{ "once upon a time", - "fourth thursday", + "united states thanksgiving", } ) func TestIntegrationSimpleOrcaMini(t *testing.T) { SkipIFNoTestData(t) + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -56,7 +63,13 @@ func TestIntegrationSimpleOrcaMini(t *testing.T) { // get true concurrency working with n_parallel support in the backend func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { SkipIFNoTestData(t) + t.Skip("concurrent prediction on single runner not currently supported") + + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -79,6 +92,10 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { SkipIFNoTestData(t) + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -87,6 +104,7 @@ func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { var wg sync.WaitGroup wg.Add(len(req)) + t.Logf("Running %d concurrently", len(req)) for i := 0; i < len(req); i++ { go func(i int) { defer wg.Done() diff --git a/server/routes.go b/server/routes.go index 26a02cc1..75e67a72 100644 --- a/server/routes.go +++ b/server/routes.go @@ -25,6 +25,7 @@ import ( "github.com/gin-gonic/gin" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/gpu" "github.com/jmorganca/ollama/llm" "github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/version" @@ -81,20 +82,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess return nil, err } - ctx := c.Request.Context() - - // check if the loaded model is still running in a subprocess, in case something unexpected happened - if loaded.runner != nil { - if err := loaded.runner.Ping(ctx); err != nil { - log.Print("loaded llm process not responding, closing now") - // the subprocess is no longer running, so close it - loaded.runner.Close() - loaded.runner = nil - loaded.Model = nil - loaded.Options = nil - } - } - needLoad := loaded.runner == nil || // is there a model loaded? loaded.ModelPath != model.ModelPath || // has the base model changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? 
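[Editor's note] Taken together, these diffs change the expected call order for consumers of the llm package: llm.Init(workDir) must run once per process (it extracts and loads any packed ROCm shim) before llm.New picks a backend from gpu.GetGPUInfo(), which is exactly what the integration tests above and the Serve() hunk below do. A rough usage sketch under those assumptions; the model path is hypothetical, Predict is assumed to remain on the LLM interface (the ext-server implementations above provide it), and PredictResult fields are not inspected.

// usage-order sketch, not part of the patch
package main

import (
	"context"
	"log"
	"os"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/llm"
)

func main() {
	// 1. One-time native initialization, as Serve() and the tests do.
	workDir, err := os.MkdirTemp("", "ollama")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(workDir)
	if err := llm.Init(workDir); err != nil {
		log.Fatalf("unable to initialize llm library: %v", err)
	}

	// 2. New() consults gpu.GetGPUInfo() and returns either the ROCm shim backend
	//    or the built-in CUDA/CPU ext server. The model path is hypothetical.
	runner, err := llm.New(workDir, "/path/to/model.gguf", nil, nil, api.DefaultOptions())
	if err != nil {
		log.Fatal(err)
	}
	defer runner.Close()

	// 3. Predict streams partial results through the callback.
	err = runner.Predict(context.Background(), llm.PredictOpts{Prompt: "why is the sky blue?"},
		func(r llm.PredictResult) {
			log.Printf("prediction chunk: %+v", r)
		})
	if err != nil {
		log.Fatal(err)
	}
}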
@@ -905,9 +892,12 @@ func Serve(ln net.Listener) error { os.Exit(0) }() - if runtime.GOOS == "linux" { + if err := llm.Init(s.WorkDir); err != nil { + return fmt.Errorf("unable to initialize llm library %w", err) + } + if runtime.GOOS == "linux" { // TODO - windows too // check compatibility to log warnings - if _, err := llm.CheckVRAM(); err != nil { + if _, err := gpu.CheckVRAM(); err != nil { log.Print(err.Error()) } } From 89bbaafa64421e835c841435d0fdff94aa4152e7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 18 Dec 2023 12:05:59 -0800 Subject: [PATCH 06/19] Build linux using ubuntu 20.04 This changes the container-based linux build to use an older Ubuntu distro to improve our compatibility matrix for older user machines --- Dockerfile.build | 41 ++++++++++++++++++++++++++----------- llm/llama.cpp/gen_common.sh | 9 +++++++- llm/rocm_shim.c | 14 +++++++++---- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/Dockerfile.build b/Dockerfile.build index f1f5f3fc..5499b0a1 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,11 +1,18 @@ # Ubuntu 20.04 amd64 dependencies -FROM --platform=linux/amd64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-amd64 +FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64 +ARG CUDA_VERSION=11.3.1-1 +ARG CMAKE_VERSION=3.22.1 # ROCm only supports amd64 ARG ROCM_VERSION=5.7 + # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html RUN apt-get update && \ - apt-get install -y wget && \ - wget "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \ + apt-get install -y wget gnupg && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \ + mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \ + wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \ chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \ mkdir --parents --mode=0755 /etc/apt/keyrings && \ wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \ @@ -14,35 +21,45 @@ RUN apt-get update && \ echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \ echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \ apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev + DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev ENV ROCM_PATH=/opt/rocm # Ubuntu 22.04 arm64 dependencies -FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64 +FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64 +ARG CUDA_VERSION=11.3.1-1 +ARG CMAKE_VERSION=3.27.6 RUN apt-get update && \ - apt-get install -y wget && \ - wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \ - chmod +x 
/tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr + apt-get install -y wget gnupg && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \ + mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \ + echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \ + wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \ + chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \ + apt-get update && \ + apt-cache madison cuda && \ + DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} FROM base-${TARGETARCH} ARG TARGETARCH ARG GOFLAGS="'-ldflags -w -s'" ARG CGO_CFLAGS ARG CLBLAST_VER=1.6.1 +ARG GOLANG_VERSION=1.21.3 # Common toolchain RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11 + DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10 # CLBlast RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \ cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. 
&& make && make install # install go -ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz -RUN mkdir -p /usr/local && tar xz -C /usr/local #include -#ifndef _WIN32 +#ifdef __linux__ #include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND) #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) #define LOAD_ERR() dlerror() #define UNLOAD_LIBRARY(handle) dlclose(handle) -#else +#elif _WIN32 #include #define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) #define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) @@ -20,6 +20,12 @@ inline static char *LOAD_ERR() { snprintf(errbuf, 8, "0x%lx", GetLastError()); return errbuf; } +#else +#include +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) +#define LOAD_ERR() dlerror() +#define UNLOAD_LIBRARY(handle) dlclose(handle) #endif void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, @@ -48,7 +54,7 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, }; printf("Lazy loading %s library\n", libPath); - s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY); + s->handle = LOAD_LIBRARY(libPath, RTLD_NOW); if (!s->handle) { err->id = -1; snprintf( From 9adca7f71128c09759fd0bc0b9a146f4e79fe935 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 14 Dec 2023 10:25:12 -0800 Subject: [PATCH 07/19] Bump llama.cpp to b1662 and set n_parallel=1 --- llm/ext_server.go | 2 +- llm/llama.cpp/gguf | 2 +- .../0001-Expose-callable-API-for-server.patch | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llm/ext_server.go b/llm/ext_server.go index bd026043..ded424a9 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -160,7 +160,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string, sparams.n_batch = C.uint(opts.NumBatch) sparams.n_gpu_layers = C.int(numGPU) sparams.main_gpu = C.int(opts.MainGPU) - sparams.n_parallel = 2 // TODO - wire up concurrency + sparams.n_parallel = 1 // TODO - wire up concurrency // Always use the value encoded in the model sparams.rope_freq_base = 0.0 diff --git a/llm/llama.cpp/gguf b/llm/llama.cpp/gguf index a7aee47b..328b83de 160000 --- a/llm/llama.cpp/gguf +++ b/llm/llama.cpp/gguf @@ -1 +1 @@ -Subproject commit a7aee47b98e45539d491071b25778b833b77e387 +Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1 diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 623243d4..2e5a981e 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,4 +1,4 @@ -From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001 +From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index d0cd8e1..5f5d4c5 100644 +index 0403853..2084fd8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -59,15 +59,15 @@ index d0cd8e1..5f5d4c5 100644 #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error -@@ -2632,6 +2635,7 @@ static void 
append_to_generated_text_from_generated_token_probs(llama_server_con +@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } } +#ifndef LLAMA_SERVER_LIBRARY int main(int argc, char **argv) { - // own arguments required by this example -@@ -3066,3 +3070,273 @@ int main(int argc, char **argv) + #if SERVER_VERBOSE != 1 +@@ -3123,3 +3127,273 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } @@ -439,10 +439,10 @@ index 0000000..d22f1b6 +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 9e1acd3..ea64b55 100644 +index f20846f..9640cf3 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu -@@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( +@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; } else { From 51082535e17fd7480aed24772413167bf0512b80 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 13 Dec 2023 14:29:09 -0800 Subject: [PATCH 08/19] Add automated test for multimodal A simple test case that verifies llava:7b can read text in an image --- scripts/setup_integration_tests.sh | 39 ++- server/llm_image_test.go | 542 +++++++++++++++++++++++++++++ server/llm_utils_test.go | 10 +- 3 files changed, 566 insertions(+), 25 deletions(-) create mode 100644 server/llm_image_test.go diff --git a/scripts/setup_integration_tests.sh b/scripts/setup_integration_tests.sh index a8651bc0..8328d039 100755 --- a/scripts/setup_integration_tests.sh +++ b/scripts/setup_integration_tests.sh @@ -9,27 +9,30 @@ REPO=$(dirname $0)/../ export OLLAMA_MODELS=${REPO}/test_data/models REGISTRY_SCHEME=https REGISTRY=registry.ollama.ai -TEST_MODEL=library/orca-mini -TEST_MODEL_TAG=latest +TEST_MODELS=("library/orca-mini:latest" "library/llava:7b") ACCEPT_HEADER="Accept: application/vnd.docker.distribution.manifest.v2+json" -mkdir -p ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/ -mkdir -p ${OLLAMA_MODELS}/blobs/ +for model in ${TEST_MODELS[@]}; do + TEST_MODEL=$(echo ${model} | cut -f1 -d:) + TEST_MODEL_TAG=$(echo ${model} | cut -f2 -d:) + mkdir -p ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/ + mkdir -p ${OLLAMA_MODELS}/blobs/ -echo "Pulling manifest for ${TEST_MODEL}:${TEST_MODEL_TAG}" -curl -s --header "${ACCEPT_HEADER}" \ - -o ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} \ - ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/manifests/${TEST_MODEL_TAG} + echo "Pulling manifest for ${TEST_MODEL}:${TEST_MODEL_TAG}" + curl -s --header "${ACCEPT_HEADER}" \ + -o ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/manifests/${TEST_MODEL_TAG} -CFG_HASH=$(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".config.digest") -echo "Pulling config blob ${CFG_HASH}" -curl -L -C - --header "${ACCEPT_HEADER}" \ - -o ${OLLAMA_MODELS}/blobs/${CFG_HASH} \ - ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${CFG_HASH} - -for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".layers[].digest" ) ; do - echo "Pulling blob ${LAYER}" + CFG_HASH=$(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".config.digest") + echo "Pulling config blob ${CFG_HASH}" curl -L -C - --header "${ACCEPT_HEADER}" \ - -o ${OLLAMA_MODELS}/blobs/${LAYER} \ - 
${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} + -o ${OLLAMA_MODELS}/blobs/${CFG_HASH} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${CFG_HASH} + + for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".layers[].digest" ) ; do + echo "Pulling blob ${LAYER}" + curl -L -C - --header "${ACCEPT_HEADER}" \ + -o ${OLLAMA_MODELS}/blobs/${LAYER} \ + ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} + done done diff --git a/server/llm_image_test.go b/server/llm_image_test.go new file mode 100644 index 00000000..8e92590d --- /dev/null +++ b/server/llm_image_test.go @@ -0,0 +1,542 @@ +package server + +import ( + "context" + "encoding/base64" + "log" + "os" + "strings" + "testing" + "time" + + "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/llm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestIntegrationMultimodal(t *testing.T) { + SkipIFNoTestData(t) + image, err := base64.StdEncoding.DecodeString(imageEncoding) + require.NoError(t, err) + req := api.GenerateRequest{ + Model: "llava:7b", + Prompt: "what does the text in this image say?", + Options: map[string]interface{}{}, + Images: []api.ImageData{ + image, + }, + } + resp := "the ollamas" + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) + defer cancel() + opts := api.DefaultOptions() + opts.Seed = 42 + opts.Temperature = 0.0 + model, llmRunner := PrepareModelForPrompts(t, req.Model, opts) + defer llmRunner.Close() + response := OneShotPromptResponse(t, ctx, req, model, llmRunner) + log.Print(response) + assert.Contains(t, strings.ToLower(response), resp) +} + +const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb +AAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAABIAAAAAQAAAEgAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAANKgAwAEAAAAAQAA +AHgAAAAAXdsepgAAAAlwSFlzAAALEwAACxMBAJqcGAAAAVlpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6 +bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDYuMC4wIj4KICAgPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1z +eW50YXgtbnMjIj4KICAgICAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6dGlmZj0iaHR0cDovL25zLmFkb2JlLmNv +bS90aWZmLzEuMC8iPgogICAgICAgICA8dGlmZjpPcmllbnRhdGlvbj4xPC90aWZmOk9yaWVudGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAg +PC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KGV7hBwAAQABJREFUeAGE3QfgX9P5OP6TIRKRncgmS6aR2DNCKEKLqqpRW9FWq0q1dEQparZKF7VK7aq99yZGSCRB +BhErk0Qmyf95nZOTfOqrv/9J7ud977nnPPt5zrz3Ntp0s61XrLnmmql58+Zp6dKlqUWLFmnZsmXp888/Tx07dkwLFy5MX3zxRT4aNWqUmjVrlho3bpzatGmT +Pvnkk5y/YsWKXHfttdfOv/VauSZNmuRj0aJFSX15cIAPruS3adOmafny5Uld5dDkXP05c+akTp06pTXWWCN99tlnacmSJQGnUVp77VbpvffeS126dM4wli4t +dK8RsJoHDvUXL16cy7du3TrjXrBgQS675prNUsu1WgV/AW/ZktSxQ4dMC37BXbDgs7Q4aG7cpHFq2bJlpo984EY/3vELB94k+eqjU36V1fz580OmSyO/WZZt +8+Zr5jKu8YZv8pTgkCoMcnCgm17atm2bz+Gv8NWnvxUrlgd9S3P+4sWLQnZNc91PP/0ktWrVOst19uzZwc9akd98lczxN3fu3FwPLudrtwrelqcsM7LG95rN +Qv4LF2U6XLvfvMWaq2gi90ahX2mttdbK5ej2o48+ymXokv7Ri/ZPP/00LQ16O3bqmOuwCbiaNSv8Ngs5fhFl2QPe1fXLBtgLutHrVyJnciffZWELS0KWytEL +Odd66oDjHrnjpdoiGTbyL3DRAX3AT77xEzAW5nrwuY9m/DTp3bvf6Hbt2oWgW2WC3ARYZQdA8+bNW2UYiILU4T6FIsw1w0NAYaZ5RoT4KgRIwa8GgBgEEjC4 
+DFJdB9jynTNYDqF+pQdDyqw23ma5nGv1MIcnuAgMHPfQWuholtKKlNaEP2heujQMYyVuTrT8i+VpUeCsNFIEueAFDWBSXD1nOO7PmjU7nK9J+uLzkE/AnRnX +yi5atDgbcMsoN3/+Z2nK1PfS2i1bxL0mmQ+OXmlEO4fEX4eOHTJORiefPNdYoxiR8nTHwCR8f/EFY8T/iqyThjyjkdHBRdbkIMGFdrLiqIx5/vwFaY2ma+R7 +1UA5M0OjM7Dw59x9sPANDn47dGgfZVOmPSOJP2RF/+5LfjmsX/ckcqp0gkfv+GQDZF9tjyyc+yUbNLjmGHPmzE0LQk6u8Yov5zUYu0YvPGRGFpmfkDd+QvAZ +F9jwg7F8+RfB29KcX+WMbvxKTfoPGDQ6HC2nShjBKuwXg126dMkKwBAiOA/CCRYBkAHaKhBSvnodIsKrywDBpVCplnWubFWSX+UZP1jKFYK/yPgqXLDQQyFw +Y1Id5THVPBxl5qxZWfBgEgZ6CLdJtC5oBrd5i+ZRNoQWPM1fMD8bIyNcGBEXn40bRUQKXhktOASMdzRSgoNTukbbhx/OjOtmqVevnql9GHe3bl1DZi2Cjpap +e/duaZ11OoXzvJsWhzI6d+6Yhg/fOk17590MFz7w8A0Pep2DvzgMC72Zt7in3DrrrBM8r53pgrsamJZEvWoUZAU2OLWMewyPQ+KHE+LBr7qff74sG7M6Ak1U +z62yenBXfJ9FsGkaLR5HoAt6qLjAw0MNouo64ENTTZwWTDaCR85SaCgtkxYV33SmnFTpJidlHXQPPidaFHjR4T6a3NNCCSBgKM9e8Fdhocu5+5wK7ehUFr8f +f/xxBL3S25LvkO+Qcrldd/v6imIcy+JG41WMtm/fPjMHISF/8P77YXALMnEAIFbkEvkqUADlI0pSFyMEDXltip0zTvkExckWMNaVzgaeesoQLmPW3arOUxlm +OIRVIzI+aotBMeoTrnx4wMQXfGhv0rhprvtFRBtOMC/gaYWaN2+R+dK1+DycS3k0zZz5cZQvRt0BnFAeJc+aPTftsvMO6eennJwVWmRTWgmGKJqhffr099LR +3/t+uvKKv6W+ffumu++5N+2z37Fpj123TLNmzkyd1umcHR9f8FG4rqdgwHnwQNG1C4vH6mRVT4xCGfjcw7trMip8N849DDDJrtZniM7xQz8McUG0SuS+NLq+ +5Coo0Lcya0b3q0uXrmFEjdMnK1tLAbYaL9lrAeCuhkf2nBgs5dgJWeFVYh/oZch4rc7iGr01YMqvOleX3XFK+iU79kEOeFLPffck53A40AFmlQ/+lXeNVvfR +Cwd86tb6aNA6fx49D3LNbawKGMcI711rrZYZGCYh5JGQUI6EQIDdg7h6dEOi5akPsaQ8BolMs+saXr9gtwyHIVhEKYdQTGICHMpQlkDeD6emCHQU41oYDtM2 +160wlCcMNOJLFwhNaJTAnzN7Tnacxk0apQ8+CIFFfoeOneKvrkTrTN/cuXMyfjQZ04DHOVvHQcFahsefHp+O+V7vaGk6A/0/U+9evdK222wVrVW3XGZA//VT +9y5tomWakV59+ZnUfO0eaY/dts+8MUo8zA4nHfvqi9Eh7x79pPfSVlvvkLp27Rz5c7KclCM/vEnkRYbyyBe/8hg/OZAhuc6KVptcyQ9PeHEfTvkSmS0LvgUz +9+NGLqMcvLPn6LYW54M/yyX0AoZruoIPbnYwM4KFfE5vuCDRAxrkf77SDhly5YHNKYMH+pTQxyblK8d58PTZZ9EdjfLKgk8GyqAHTOd+yQU+/KFNK5wDRshB +HQHAWJJ9tY8u6lotip2xAXXBwYNrrSacTQm6fft2uZIbCONUkGNeswspJhDIUAkVEgw5KAIw5xA5RyRBggGmOqIruBwVnEqMFkekd28ZZqKOuu6DRdBoqwZB +mNVp4Q7zyTQTJhjKoo/Q5FV60MYJCYLQFy1cnAezTVY0zhG2jkeaNFkjfRKKUL9ROJl6eKs8wl0VCd+2W/ZP199wSx5Xde68TuZ39913y3Jj8HfffXemY8xL +L6d33p2+ypnRPueTxenHxx8VrdkJacqUqenKq65PHdq3ztH//odfSDuP2DRdfPGf8phDj+C5515Izzz3Sho8sE+aMeP9rBfKZ7DgodU5eaOf/J37JdOqC2Xc +x0s98AhWNXaBY01jreVF9sZEJjEWL14SjhRjthhHduzUYZUDkgVc4Ah04DvneA734FcOrRy04qTTpStth5wrP3TuUKfaolYCjeq7x07c0+XnANVuODY7U7d/ +//5RZvZK+2yWJ0DkC5r40c0nB3Q50EVmi6Krr4vLJ9hVjx49Mgw0uCZv+Brt8839c9eOsarsJgG46Rpws3cIQjxlOK9NX0NGCUOSRxgSj2e46kJeiC9llEOs +svKrUNFAobWsusqgi4O4B9aSJYuzMEUFjFa60WywbHaKQ+uOEOr8+TFLFJMKZoWUb8J5o2yZ4SoGBHaTiLJpRaPc314UhiOBAzchi3auK83odr502fL0wnOP +pf2+fWC65por8njt3XCc9dZbN3XtPjB9MGNKOurow9Mf/3BhhvX66+NiZmlJ2mzTTTMOfx599LH03UOOC8dpm/b/9l7puOOOybhqAfhv+8/t6fCjT047bjc0 +ZtEEqIURzUv/f3l0N4xPi9HqfpQILmqThyCGVrJirGTRIsaL9MDQ/CpDBytCbmYttcqSmT7BsM4GNo3JCF1kxkTHuqfkSTYcRyKrqj92U4JYCaLkpuyCGKN+ ++un8fF51TIdsEN3orLYCpm4cmLNnzwrcZbxKN2wEPvTArw6cyreLY8rUqbm1gZfjVRzV/ti2AMAG2K18ZeUL9mTJWefNm5umTXsn+4BGSCBv0q/fgNGEvmYQ +9nkIGIGYAQzTiKnRQblqyBDJZ6AShBAjrrYgZvGygYXy1VOe4MB1TlDV+8EDSz44tVvmPlrANIXMQQgLvqKg0q81roGLcpct/SK1DVjRXoZBLItAEN21EIKx +SnXmFs2j/7xC/zYmHYIegs+RJcaJxkaMjlHBj3a4yAKdzhkrXuGkODR2aN82JlzapoED1k+7fm2XXF/5F154LQ0Z3C+1DmV2jan6UaN2z/cooVvXrlneYEq9 +e/eKaPl+8Ls0XXDB77Niyf2ll14K2TTJRrHBkCEpZp3T3fc9HBMbrbKC0fDZgtJ9IadyLItfA/fSvwe/ZQyaa9fOAJrDcIZPPpmX+cGHvLlz52V+Ca7qiuzw +TS7krx4jIxeHGVCtBHmSjXK1LJ3Kd78Etfmruk/oAkdZuMkUHjDlfxF5einqu4dhY1nd02qH9PRZyJoeq/3Jq/b0/gcfZD1VfcFJZuQOJ3rhq/erbvkCvsEB +b/r06VG+TJigV7lP5n2SGkOqn4tQwnGt+eXFy8IIeTRiJcAoAUMEXg0cMkAJqEYAMIx7uoahmMVCbG3uFy2K/nYkeZVRsCRlGLQmWpJPmHDoWoBLGcpwjI8+ 
+[… base64-encoded PNG image data omitted …]
+Oi0b7uWLKRCYggA8Hau3N1p/1YQFrBhF2yyitsEE4c4DAG9p2lIEk2Kd36BBy+Z5A8o47ll+jGkuTBluISFk4VoF12OasmhdNsufMN3Eic9E/taVgwd4UV5b +ymFK4w/Mqy1wglcihP37LRPPrMYIJRr1AYPrRzgwkHWYxlfqRBv01i4tTSHQzKzhfPFcYXhXL2/CGFUf4I1ChCsury0kYLV6GnwisVIxst/6ASfgEYX860OP +Nbvtsn26osSDYJovNPa0RjJUcsKpDnW/i3lkjPIm58FeSVlrK7vFygvKDM+4VwLsW3557rjjrqQfhV2H7nguP7pqq8tmm38ix0i0PARLKtUJYcrS/pBJcN6I +OSTIRjgMh6ns4e8VQQllxNRLgyEa4YR0nVww5jlyYWhoGR1wVhizCEBEhmiIeSMYAAzFPJ5zUcxF2JylEwl8IE8bCCy/+2AwntJJ7kx1mNtXE7iiePUbIcs1 +ND+mzxConP9LhxZSD+FQt2SlsN2+8wVzSODELBgXDsGpDv12X10sEg0sakYwwAx/AiLmW+RVBr5fiPLu8dXnC0UADu4GuOEJPNroGQy8aEywmpfStqQe8MAL +HMGFOhHc/RKgsvT+q/+pEMYll1g8GVobyYGBX4fGcPHVJ4w9YEC/ZFZCjLaUl76ZMnkppyZiB3XwgrMv4MJz/ReIIczGRaUY3CcotDp8gA8tuHhwpS/oLp+6 +4Od3f3iwmfrW681Hwu1781V9fl63Q9EPjIUEMfn7xluxanuZ5rFxk0IbPBtP4tU0vfs0PQOe+YJn3wjBfzXf2/VOGIaFQ4H3bv4V9Hg7FOyT4/+adfUbsHKu +FiHB+GqZvkskLuHx+efb9uADfA6geXXyq02X2Jab2yhkSimOgjpKGPzXCd8m2WgpndFpCLCOqhgew7z2Wrt7kG/baqH2pEoMKB9iAgDCWBuqwn3I044ykjbk +ARMN4H+VlVd9rImDzFtGaDU25HOlfFd9OgtWdYNbX8AguY9h9IMLoUxpRvnd/0gwsrPMtFN1goFgUQJvhzYr+NSNCcEttRquPb/Ab33CNGA3YUw4qm3PWzwb +fLcnvvo/JbZdcFdbwTMvNiDD6zT7m3G+OqvgSF0wgFnfs1z8xrQErpjUfc8xN8HTpv8JU9BXBFef6pk+sFJwBGdWkNRUCNzBhzoIq/us1vzRJ/BTtMY5eMlz +OGOlKSqCCy7/JXCpp/Bb7buP7nhP2631ace44MFvEqXcNcqjtfxoUHymrp5xD77hATzacb+inS2/B49HHTMsXSwtC9yCCw7wVeEP/2nDf7SG926rrb7mcDc0 +4iFgPERgGTA6pIl4FQC+VQZByvrvG4IQr5OYkMCHVFfVj+i0rTpoV/Vr338XhtNuLUb1G4Ehk2aCBB1Wj7Y9R0B5wOG+OTH9UC94qk7PJcykrRI0z13ul9DO +F+4G66MOhIFURGExivAVefJcfe5jTHWBwz3PME072dclrIjDQ1r3FTwUlIuVgEMwqadHuEsiqPJ+JFwsFvwfMQaCF220Fs0BMK31Y7nhB559gwHsLT3bvoEF +zjz3rC448Uy92tamRcuY1LyPe5Ln6IWe6nG/mHORsDivB13V88wz7Qm7FBTlpD7aHEP6TTirHThSl377DacEj6Jp2+2S9+XBN7ajwEsKV+CNd4EnDFHkoaBf +CqsGl6mUpj3Dr+AmGHDnm8UDr9+sqnqmxDOKcnJYGuxSSkM+sINRPWBjcCigbssNXmE4ZHoIuZLKJAhEBKb7zXDhuG18ZkuIIBTxdVo5k59WP2gIAUFQHcfo +6vdt8hRAGAjDuiefqBjg6h6YdEA+SPWMVdCuMmDEXHWaTb65LRZOQgjNGD8Crna+RNvylyBrH4zq0T/t05SW2NBS+qM9bdNKtL4VASwDl0Zd704N5WHcEC6D +PrsH3kJ6Ma9+qM99DKdNdeuHvrpX/cQY8AlOV/U13+QRFhwsU95pXwz8kZ7mhtr+6Y8+lNXEkOBx3zfrVX2XR9/NixBu6/MIIhj11wUeZSkSSd2EtPCGgQpm +bVGGnvEy4IK7rG/6AvfabsdGcdJPKDX9t1IdHOpmaco6KO+CF3BpiwDAo9/cWZFPv7WtbrB7rm7tqNOYkxDpp2dgkR/c6tdP+cDggmvf8IXO2oYrl7bafrWv +voQTefWxZ9CMIP9/0Wj1ClQSO/4AAAAASUVORK5CYII=` diff --git a/server/llm_utils_test.go b/server/llm_utils_test.go index 592fac25..9560bcab 100644 --- a/server/llm_utils_test.go +++ b/server/llm_utils_test.go @@ -7,7 +7,6 @@ import ( "path" "runtime" "testing" - "time" "github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/llm" @@ -39,7 +38,6 @@ func PrepareModelForPrompts(t *testing.T, modelName string, opts api.Options) (* } func OneShotPromptResponse(t *testing.T, ctx context.Context, req api.GenerateRequest, model *Model, runner llm.LLM) string { - checkpointStart := time.Now() prompt, err := model.Prompt(PromptVars{ System: req.System, Prompt: req.Prompt, @@ -56,12 +54,10 @@ func OneShotPromptResponse(t *testing.T, ctx context.Context, req api.GenerateRe success <- true } } - checkpointLoaded := time.Now() predictReq := llm.PredictOpts{ - Prompt: prompt, - Format: req.Format, - CheckpointStart: checkpointStart, - CheckpointLoaded: checkpointLoaded, + Prompt: prompt, + Format: req.Format, + Images: req.Images, } err = runner.Predict(ctx, predictReq, cb) require.NoError(t, err, "predict call failed") From 1b991d0ba961936ec8bb50c5b8dabdcd2f9aff25 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 13 Dec 2023 17:26:47 -0800 Subject: [PATCH 09/19] Refine build to support CPU only If someone checks out the ollama repo and 
doesn't install the CUDA library, this will ensure they can build a CPU only version --- Dockerfile.cpu | 35 ++++++++++++++++++++++++++ README.md | 29 +--------------------- docs/development.md | 35 +++++++++++++++++++++++--- gpu/gpu.go | 33 ++++++++++++++++-------- gpu/gpu_info_cuda.c | 24 ++++++------------ gpu/gpu_info_rocm.c | 17 +++++++------ llm/ext_server.go | 12 +-------- llm/llama.cpp/gen_linux.sh | 51 ++++++++++++++++++++++++-------------- scripts/build_linux.sh | 14 ++++++++--- 9 files changed, 152 insertions(+), 98 deletions(-) create mode 100644 Dockerfile.cpu diff --git a/Dockerfile.cpu b/Dockerfile.cpu new file mode 100644 index 00000000..dd6926df --- /dev/null +++ b/Dockerfile.cpu @@ -0,0 +1,35 @@ +# Dockerfile variant to ensure we can build CPU only on linux +FROM --platform=linux/amd64 ubuntu:20.04 AS base-cpu-amd64 +ENV CMAKE_ARCH "x86_64" + +FROM --platform=linux/arm64 ubuntu:20.04 AS base-cpu-arm64 +ENV CMAKE_ARCH "aarch64" + +FROM base-cpu-${TARGETARCH} AS cpu-builder +ARG TARGETARCH +ARG GOFLAGS +ARG CGO_CFLAGS + +# Common toolchain +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y wget make gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10 +RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-${CMAKE_ARCH}.sh" -O /tmp/cmake-installer.sh && \ + chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr + +# install go +ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz +RUN mkdir -p /usr/local && tar xz -C /usr/local err = NULL; const int buflen = 256; char buf[buflen + 1]; @@ -56,6 +57,13 @@ void cuda_init(cuda_init_resp_t *resp) { return; } } + + ret = (*resp->ch.initFn)(); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "nvml vram init failure: %d", ret); + resp->err = strdup(buf); + } + return; } @@ -73,17 +81,9 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { return; } - ret = (*h.initFn)(); - if (ret != NVML_SUCCESS) { - snprintf(buf, buflen, "nvml vram init failure: %d", ret); - resp->err = strdup(buf); - return; - } - // TODO - handle multiple GPUs ret = (*h.getHandle)(0, &device); if (ret != NVML_SUCCESS) { - (*h.shutdownFn)(); snprintf(buf, buflen, "unable to get device handle: %d", ret); resp->err = strdup(buf); return; @@ -91,20 +91,12 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { ret = (*h.getMemInfo)(device, &memInfo); if (ret != NVML_SUCCESS) { - (*h.shutdownFn)(); snprintf(buf, buflen, "device memory info lookup failure: %d", ret); resp->err = strdup(buf); return; } resp->total = memInfo.total; resp->free = memInfo.free; - - ret = (*h.shutdownFn)(); - if (ret != NVML_SUCCESS) { - snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret); - resp->err = strdup(buf); - } - return; } #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c index 88bd2dad..e69d5cba 100644 --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c @@ -20,6 +20,7 @@ const char *rocm_lib_paths[] = { #endif void rocm_init(rocm_init_resp_t *resp) { + rsmi_status_t ret; resp->err = NULL; const int buflen = 256; char buf[buflen + 1]; @@ -56,6 +57,13 @@ void rocm_init(rocm_init_resp_t *resp) { return; } } + + ret = (*resp->rh.initFn)(0); + if (ret != RSMI_STATUS_SUCCESS) { + snprintf(buf, buflen, "rocm vram init failure: %d", ret); + 
resp->err = strdup(buf); + } + return; } @@ -70,10 +78,8 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { char buf[buflen + 1]; int i; - ret = (*h.initFn)(0); - if (ret != RSMI_STATUS_SUCCESS) { - snprintf(buf, buflen, "rocm vram init failure: %d", ret); - resp->err = strdup(buf); + if (h.handle == NULL) { + resp->err = strdup("nvml handle sn't initialized"); return; } @@ -89,20 +95,17 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { // Get total memory - used memory for available memory ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem); if (ret != RSMI_STATUS_SUCCESS) { - (*h.shutdownFn)(); snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); resp->err = strdup(buf); return; } ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem); if (ret != RSMI_STATUS_SUCCESS) { - (*h.shutdownFn)(); snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); resp->err = strdup(buf); return; } - (*h.shutdownFn)(); resp->total = totalMem; resp->free = totalMem - usedMem; return; diff --git a/llm/ext_server.go b/llm/ext_server.go index ded424a9..5fcd8e92 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -21,17 +21,7 @@ package llm #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a - -// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only -#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a -#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a -#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a -#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a -#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin #cgo windows LDFLAGS: -lext_server_shared -lpthread diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh index 93c998f4..3608ddd6 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/llama.cpp/gen_linux.sh @@ -13,28 +13,43 @@ source $(dirname $0)/gen_common.sh init_vars git_module_setup apply_patches -CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +if [ -d /usr/local/cuda/lib64/ ] ; then + CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +else + CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +fi BUILD_DIR="gguf/build/cuda" LIB_DIR="${BUILD_DIR}/lib" mkdir -p ../../dist/ build -# TODO - explore mechanism to soften the hard cuda dependency on linux -# by conditionally building some archive here that aggregates the cuda libs if present -# so 
that the cgo flags link this intermediate archive instead of the underlying cuda libs -# -# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \ -# -Wl,--whole-archive \ -# ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \ -# ${BUILD_DIR}/common/libcommon.a \ -# ${BUILD_DIR}/libllama.a \ -# ${BUILD_DIR}/examples/llava/libllava_static.a \ -# -Wl,--no-whole-archive \ -# -lrt -lpthread -ldl -lstdc++ -lm \ -# /usr/local/cuda/lib64/libcudart_static.a \ -# /usr/local/cuda/lib64/libcublas_static.a \ -# /usr/local/cuda/lib64/libcublasLt_static.a \ -# /usr/local/cuda/lib64/libcudadevrt.a \ -# /usr/local/cuda/lib64/libculibos.a + +if [ -d /usr/local/cuda/lib64/ ] ; then + pwd + ar -M < Date: Fri, 15 Dec 2023 14:27:27 -0800 Subject: [PATCH 10/19] Refine handling of shim presence This allows the CPU only builds to work on systems with Radeon cards --- llm/llm.go | 10 ++++++---- llm/shim_ext_server.go | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llm/llm.go b/llm/llm.go index 86dd3346..69ea705f 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -22,6 +22,9 @@ type LLM interface { Close() } +// Set to false on linux/windows if we are able to load the shim +var ShimPresent = false + func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err @@ -79,11 +82,10 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 gpuInfo := gpu.GetGPUInfo() - switch gpuInfo.Driver { - case "ROCM": + if gpuInfo.Driver == "ROCM" && ShimPresent { return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts) - default: - // Rely on the built-in CUDA based server which will fall back to CPU + } else { + // Rely on the built-in CUDA/Metal based server which will fall back to CPU return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) } } diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index 0e7bcfae..7505adaa 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -30,7 +30,6 @@ import ( var libEmbed embed.FS var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported") -var NoShim = true type shimExtServer struct { s C.struct_rocm_llama_server @@ -78,7 +77,7 @@ func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { } func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { - if NoShim { + if !ShimPresent { return nil, RocmShimMissing } log.Printf("Loading ROCM llm server") @@ -207,6 +206,6 @@ func extractLib(workDir string) error { case err != nil: return fmt.Errorf("stat ROCm shim %s: %v", files[0], err) } - NoShim = false + ShimPresent = true return nil } From 5646826a792ce07bbf4989210841d26b86f0de44 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 15 Dec 2023 20:16:02 -0800 Subject: [PATCH 11/19] Add WSL2 path to nvidia-ml.so library --- gpu/gpu_info_cuda.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index 710daa89..30aba037 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -8,6 +8,7 @@ const char *cuda_lib_paths[] = { "libnvidia-ml.so", "/usr/local/cuda/lib64/libnvidia-ml.so", + "/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob? 
NULL, }; #else From 54dbfa4c4a2c52dc0c2361e65090a0ede3339a63 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 18 Dec 2023 18:32:04 -0800 Subject: [PATCH 12/19] Carry ggml-metal.metal as payload --- llm/llama.go | 44 +++++++++++++++++++++++++++++++++++++ llm/shim_darwin.go | 18 +++++++++++++-- llm/shim_ext_server.go | 50 +++++------------------------------------- 3 files changed, 65 insertions(+), 47 deletions(-) diff --git a/llm/llama.go b/llm/llama.go index 26a0d588..ec067194 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -6,8 +6,12 @@ import ( _ "embed" "errors" "fmt" + "io" + "io/fs" + "log" "os" "os/exec" + "path/filepath" "sync" "time" @@ -116,6 +120,7 @@ type ImageData struct { var ( errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") + payloadMissing = fmt.Errorf("expected payload not included in this build of ollama") ) // StatusWriter is a writer that captures error messages from the llama runner process @@ -202,3 +207,42 @@ type EmbeddingRequest struct { type EmbeddingResponse struct { Embedding []float64 `json:"embedding"` } + +func extractLib(workDir, glob string) error { + files, err := fs.Glob(libEmbed, glob) + if err != nil || len(files) == 0 { + return payloadMissing + } + + if len(files) != 1 { + // Shouldn't happen, but just use the first one we find + log.Printf("WARNING: multiple payloads detected - using %s", files[0]) + } + + srcFile, err := libEmbed.Open(files[0]) + if err != nil { + return fmt.Errorf("read payload %s: %v", files[0], err) + } + defer srcFile.Close() + if err := os.MkdirAll(workDir, 0o755); err != nil { + return fmt.Errorf("create payload temp dir %s: %v", workDir, err) + } + + destFile := filepath.Join(workDir, filepath.Base(files[0])) + + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", files[0], err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, srcFile); err != nil { + return fmt.Errorf("copy payload %s: %v", files[0], err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", files[0], err) + } + return nil +} diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go index adf02108..3cefe4c5 100644 --- a/llm/shim_darwin.go +++ b/llm/shim_darwin.go @@ -1,18 +1,32 @@ package llm import ( + "embed" "fmt" + "log" + "os" "github.com/jmorganca/ollama/api" ) -// no-op stubs for mac +//go:embed llama.cpp/gguf/build/*/bin/ggml-metal.metal +var libEmbed embed.FS func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { // should never happen... return nil, fmt.Errorf("ROCM GPUs not supported on Mac") } -func nativeInit(workDir string) error { +func nativeInit(workdir string) error { + err := extractLib(workdir, "llama.cpp/gguf/build/*/bin/ggml-metal.metal") + if err != nil { + if err == payloadMissing { + // TODO perhaps consider this a hard failure on arm macs? 
+ log.Printf("ggml-meta.metal payload missing") + return nil + } + return err + } + os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) return nil } diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index 7505adaa..fa841d49 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -14,7 +14,6 @@ import ( "embed" "errors" "fmt" - "io" "io/fs" "log" "os" @@ -109,13 +108,15 @@ func (llm *shimExtServer) Close() { } func nativeInit(workdir string) error { - err := extractLib(workdir) + err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*") if err != nil { - if err == RocmShimMissing { - log.Printf("%s", err) + if err == payloadMissing { + log.Printf("%s", RocmShimMissing) return nil } return err + } else { + ShimPresent = true } // Verify we have permissions - either running as root, or we have group access to the driver @@ -168,44 +169,3 @@ func nativeInit(workdir string) error { } return nil } - -func extractLib(workDir string) error { - files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*") - if err != nil || len(files) == 0 { - // this is expected, ollama may be compiled without shim library packed in - return RocmShimMissing - } - - if len(files) != 1 { - // Shouldn't happen, but just use the first one we find - log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0]) - } - - srcFile, err := libEmbed.Open(files[0]) - if err != nil { - return fmt.Errorf("read ROCm shim %s: %v", files[0], err) - } - defer srcFile.Close() - if err := os.MkdirAll(workDir, 0o755); err != nil { - return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err) - } - - destFile := filepath.Join(workDir, filepath.Base(files[0])) - - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write ROCm shim %s: %v", files[0], err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return fmt.Errorf("copy ROCm shim %s: %v", files[0], err) - } - case err != nil: - return fmt.Errorf("stat ROCm shim %s: %v", files[0], err) - } - ShimPresent = true - return nil -} From 6558f94ed022a0d8ef0c06afa2df58fb1c298676 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 19 Dec 2023 13:32:24 -0800 Subject: [PATCH 13/19] Fix darwin intel build --- gpu/gpu_darwin.go | 10 ++++++++-- llm/ext_server.go | 16 ++++++---------- llm/llama.cpp/gen_darwin.sh | 17 +++++------------ llm/shim_darwin.go | 4 ++-- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index e4a9456a..14bd2655 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -4,6 +4,8 @@ package gpu import "C" import ( + "runtime" + "github.com/jmorganca/ollama/api" ) @@ -25,8 +27,12 @@ func GetGPUInfo() GpuInfo { } func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { - // default to enable metal on macOS - return 1 + if runtime.GOARCH == "arm64" { + return 1 + } + + // metal only supported on arm64 + return 0 } func nativeInit() error { diff --git a/llm/ext_server.go b/llm/ext_server.go index 5fcd8e92..ab74eb00 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -7,17 +7,13 @@ package llm #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable #cgo darwin CFLAGS: -D_DARWIN_C_SOURCE #cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL 
-DGGML_METAL_NDEBUG +#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin LDFLAGS: -lc++ -framework Accelerate -#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a -#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a -#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a -#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a -#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a -#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a -#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a -#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a +#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs diff --git a/llm/llama.cpp/gen_darwin.sh b/llm/llama.cpp/gen_darwin.sh index f159ceff..1364e9d1 100755 --- a/llm/llama.cpp/gen_darwin.sh +++ b/llm/llama.cpp/gen_darwin.sh @@ -9,15 +9,14 @@ set -o pipefail echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars -CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 ${CMAKE_DEFS}" +CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on ${CMAKE_DEFS}" +BUILD_DIR="gguf/build/metal" case "${GOARCH}" in "amd64") - CMAKE_DEFS="-DLLAMA_METAL=off -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}" - BUILD_DIR="gguf/build/cpu" + CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}" ;; "arm64") - CMAKE_DEFS="-DLLAMA_METAL=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}" - BUILD_DIR="gguf/build/metal" + CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}" ;; *) echo "GOARCH must be set" @@ -28,10 +27,4 @@ esac git_module_setup apply_patches -build - -# TODO - improve this to handle test cases that need it to be in "." 
around the tree -# Enable local debug/run usecase -if [ -e "gguf/ggml-metal.metal" ]; then - cp gguf/ggml-metal.metal ../../ -fi +build \ No newline at end of file diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go index 3cefe4c5..f63ce8c8 100644 --- a/llm/shim_darwin.go +++ b/llm/shim_darwin.go @@ -9,7 +9,7 @@ import ( "github.com/jmorganca/ollama/api" ) -//go:embed llama.cpp/gguf/build/*/bin/ggml-metal.metal +//go:embed llama.cpp/gguf/ggml-metal.metal var libEmbed embed.FS func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { @@ -18,7 +18,7 @@ func newRocmShimExtServer(model string, adapters, projectors []string, numLayers } func nativeInit(workdir string) error { - err := extractLib(workdir, "llama.cpp/gguf/build/*/bin/ggml-metal.metal") + err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal") if err != nil { if err == payloadMissing { // TODO perhaps consider this a hard failure on arm macs? From 1d1eb1688cf46c4b9aa599047d98ffc4d723b692 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 19 Dec 2023 15:52:34 -0800 Subject: [PATCH 14/19] Additional nvidial-ml path to check --- gpu/gpu_info_cuda.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index 30aba037..20055ed6 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -8,6 +8,7 @@ const char *cuda_lib_paths[] = { "libnvidia-ml.so", "/usr/local/cuda/lib64/libnvidia-ml.so", + "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so", "/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob? NULL, }; @@ -40,6 +41,8 @@ void cuda_init(cuda_init_resp_t *resp) { resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY); } if (!resp->ch.handle) { + // TODO improve error message, as the LOAD_ERR will have typically have the + // final path that was checked which might be confusing. snprintf(buf, buflen, "Unable to load %s library to query for Nvidia GPUs: %s", cuda_lib_paths[0], LOAD_ERR()); From 7555ea44f81843a65d373e2bd20936adaea67c28 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 20 Dec 2023 10:36:01 -0800 Subject: [PATCH 15/19] Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 
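To make the new selection flow concrete, here is a minimal Go sketch of how a caller can choose between the statically linked default server and a dynamically loaded GPU shim. It reuses the `AvailableShims` map and the `newDynamicShimExtServer` / `newDefaultExtServer` signatures that appear in the diffs below, but the `chooseExtServer` helper and its fall-back-to-CPU behavior are illustrative assumptions, not code from this patch.

```go
package llm

import (
	"log"

	"github.com/jmorganca/ollama/api"
)

// chooseExtServer is a hypothetical helper sketching the runtime selection:
// prefer a dynamically loaded GPU shim when one was extracted at startup,
// otherwise rely on the statically linked CPU ("default") server.
func chooseExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	if libPath, ok := AvailableShims[library]; ok && library != "default" {
		log.Printf("using dynamic shim %s from %s", library, libPath)
		return newDynamicShimExtServer(libPath, model, adapters, projectors, numLayers, opts)
	}
	// No shim available for this library: use the built-in CPU server,
	// which keeps CPU-only and unsupported-GPU systems working.
	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
}
```

Keeping the GPU code paths behind dlopen-style loading means the main binary no longer links against the CUDA or ROCm runtimes directly, so a missing GPU library degrades to the CPU path instead of preventing the server from starting.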
--- Dockerfile.build | 2 +- gpu/gpu.go | 6 +- gpu/gpu_darwin.go | 1 + gpu/types.go | 1 + llm/{rocm_shim.c => dynamic_shim.c} | 70 ++++++++-------- llm/dynamic_shim.h | 74 +++++++++++++++++ llm/ext_server.go | 7 +- llm/llama.cpp/gen_common.sh | 10 +-- llm/llama.cpp/gen_linux.sh | 98 +++++++++++----------- llm/llama.go | 60 +++++++------- llm/llm.go | 23 ++++-- llm/rocm_shim.h | 73 ----------------- llm/shim_darwin.go | 6 +- llm/shim_ext_server.go | 121 ++++++++++++---------------- 14 files changed, 272 insertions(+), 280 deletions(-) rename llm/{rocm_shim.c => dynamic_shim.c} (55%) create mode 100644 llm/dynamic_shim.h delete mode 100644 llm/rocm_shim.h diff --git a/Dockerfile.build b/Dockerfile.build index 5499b0a1..6b7e3c4d 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64 ARG CUDA_VERSION=11.3.1-1 ARG CMAKE_VERSION=3.22.1 # ROCm only supports amd64 -ARG ROCM_VERSION=5.7 +ARG ROCM_VERSION=6.0 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html RUN apt-get update && \ diff --git a/gpu/gpu.go b/gpu/gpu.go index d03812c8..91ced3a8 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo { } var memInfo C.mem_info_t - resp := GpuInfo{"", 0, 0} + resp := GpuInfo{"", "", 0, 0} if gpuHandles.cuda != nil { C.cuda_check_vram(*gpuHandles.cuda, &memInfo) if memInfo.err != nil { @@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo { C.free(unsafe.Pointer(memInfo.err)) } else { resp.Driver = "CUDA" + resp.Library = "cuda_server" } } else if gpuHandles.rocm != nil { C.rocm_check_vram(*gpuHandles.rocm, &memInfo) @@ -81,11 +82,14 @@ func GetGPUInfo() GpuInfo { C.free(unsafe.Pointer(memInfo.err)) } else { resp.Driver = "ROCM" + resp.Library = "rocm_server" } } if resp.Driver == "" { C.cpu_check_ram(&memInfo) resp.Driver = "CPU" + // In the future we may offer multiple CPU variants to tune CPU features + resp.Library = "default" } if memInfo.err != nil { log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err)) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 14bd2655..ccf67b51 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo { return GpuInfo{ Driver: "METAL", + Library: "default", TotalMemory: 0, FreeMemory: 0, } diff --git a/gpu/types.go b/gpu/types.go index a84a0a8d..a56da45e 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -3,6 +3,7 @@ package gpu // Beginning of an `ollama info` command type GpuInfo struct { Driver string `json:"driver,omitempty"` + Library string `json:"library,omitempty"` TotalMemory uint64 `json:"total_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"` diff --git a/llm/rocm_shim.c b/llm/dynamic_shim.c similarity index 55% rename from llm/rocm_shim.c rename to llm/dynamic_shim.c index e8304aa0..8b5d67c9 100644 --- a/llm/rocm_shim.c +++ b/llm/dynamic_shim.c @@ -1,4 +1,4 @@ -#include "rocm_shim.h" +#include "dynamic_shim.h" #include #include @@ -28,8 +28,8 @@ inline static char *LOAD_ERR() { #define UNLOAD_LIBRARY(handle) dlclose(handle) #endif -void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, - ext_server_resp_t *err) { +void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, + ext_server_resp_t *err) { int i = 0; struct lookup { char *s; @@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, s->handle = LOAD_LIBRARY(libPath, RTLD_NOW); if (!s->handle) { err->id = -1; - snprintf( - err->msg, 
err->msg_len, - "Unable to load rocm server library: %s (If you have a Radeon card, " - "did you install the ROCM libraries?)", - LOAD_ERR()); + snprintf(err->msg, err->msg_len, + "Unable to load dynamic server library: %s", LOAD_ERR()); return; } @@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, } } -inline void rocm_shim_llama_server_init(struct rocm_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err) { +inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err) { s.llama_server_init(sparams, err); } -inline void rocm_shim_llama_server_start(struct rocm_llama_server s) { +inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) { s.llama_server_start(); } -inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) { +inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) { s.llama_server_stop(); } -inline void rocm_shim_llama_server_completion(struct rocm_llama_server s, - const char *json_req, - ext_server_resp_t *resp) { +inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s, + const char *json_req, + ext_server_resp_t *resp) { s.llama_server_completion(json_req, resp); } -inline void rocm_shim_llama_server_completion_next_result( - struct rocm_llama_server s, const int task_id, +inline void dynamic_shim_llama_server_completion_next_result( + struct dynamic_llama_server s, const int task_id, ext_server_task_result_t *result) { s.llama_server_completion_next_result(task_id, result); } -inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, - const int task_id, - ext_server_resp_t *err) { +inline void dynamic_shim_llama_server_completion_cancel( + struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { s.llama_server_completion_cancel(task_id, err); } -inline void rocm_shim_llama_server_release_task_result( - struct rocm_llama_server s, ext_server_task_result_t *result) { +inline void dynamic_shim_llama_server_release_task_result( + struct dynamic_llama_server s, ext_server_task_result_t *result) { s.llama_server_release_task_result(result); } -inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { +inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { s.llama_server_tokenize(json_req, json_resp, err); } -inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { +inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { s.llama_server_detokenize(json_req, json_resp, err); } -inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { +inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { s.llama_server_embedding(json_req, json_resp, err); } -inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, - char **json_resp) { +inline void dynamic_shim_llama_server_release_json_resp( + struct dynamic_llama_server s, char **json_resp) { s.llama_server_release_json_resp(json_resp); } diff 
--git a/llm/dynamic_shim.h b/llm/dynamic_shim.h new file mode 100644 index 00000000..5e4e78b7 --- /dev/null +++ b/llm/dynamic_shim.h @@ -0,0 +1,74 @@ +#include + +#include "server.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct dynamic_llama_server { + void *handle; + void (*llama_server_init)(ext_server_params_t *sparams, + ext_server_resp_t *err); + void (*llama_server_start)(); + void (*llama_server_stop)(); + void (*llama_server_completion)(const char *json_req, + ext_server_resp_t *resp); + void (*llama_server_completion_next_result)(const int task_id, + ext_server_task_result_t *result); + void (*llama_server_completion_cancel)(const int task_id, + ext_server_resp_t *err); + void (*llama_server_release_task_result)(ext_server_task_result_t *result); + void (*llama_server_tokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_detokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_embedding)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_release_json_resp)(char **json_resp); +}; + +void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s, + ext_server_resp_t *err); + +// No good way to call C function pointers from Go so inline the indirection +void dynamic_shim_llama_server_init(struct dynamic_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err); + +void dynamic_shim_llama_server_start(struct dynamic_llama_server s); + +void dynamic_shim_llama_server_stop(struct dynamic_llama_server s); + +void dynamic_shim_llama_server_completion(struct dynamic_llama_server s, + const char *json_req, + ext_server_resp_t *resp); + +void dynamic_shim_llama_server_completion_next_result( + struct dynamic_llama_server s, const int task_id, + ext_server_task_result_t *result); + +void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s, + const int task_id, + ext_server_resp_t *err); + +void dynamic_shim_llama_server_release_task_result( + struct dynamic_llama_server s, ext_server_task_result_t *result); + +void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); + +void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err); + +void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); +void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s, + char **json_resp); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/llm/ext_server.go b/llm/ext_server.go index ab74eb00..048b1a65 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -17,7 +17,10 @@ package llm #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a #cgo 
linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin #cgo windows LDFLAGS: -lext_server_shared -lpthread @@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { C.llama_server_release_json_resp(json_resp) } -func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { server := &llamaExtServer{opts} return newExtServer(server, model, adapters, projectors, numLayers, opts) } diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh index 83a21cf9..c6b84f7d 100644 --- a/llm/llama.cpp/gen_common.sh +++ b/llm/llama.cpp/gen_common.sh @@ -6,7 +6,7 @@ init_vars() { CMAKE_DEFS="-DLLAMA_ACCELERATE=on" # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static" - if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then + if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}" else # TODO - add additional optimization flags... @@ -15,7 +15,7 @@ init_vars() { } git_module_setup() { - if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then + if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then echo "Skipping submodule initialization" return fi @@ -25,13 +25,13 @@ git_module_setup() { } apply_patches() { - if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then + if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then echo "Skipping submodule patching" return fi # Workaround git apply not handling creation well for iteration rm -f gguf/examples/server/server.h - for patch in ${PATCHES} ; do + for patch in ${PATCHES}; do git -C gguf apply ../patches/${patch} done } @@ -39,4 +39,4 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 -} \ No newline at end of file +} diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh index 3608ddd6..e3cb87a8 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/llama.cpp/gen_linux.sh @@ -1,81 +1,81 @@ #!/bin/bash # This script is intended to run inside the go generate -# working directory must be ../llm/llama.cpp +# working directory must be llm/llama.cpp + +# First we build our default built-in library which will be linked into the CGO +# binary as a normal dependency. This default build is CPU based. +# +# Then we build a CUDA dynamic library (although statically linked with the CUDA +# library dependencies for maximum portability) +# +# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly +# important to be a dynamic lib even if it's the only GPU library detected because +# we can't redistribute the objectfiles but must rely on dynamic libraries at +# runtime, which could lead the server not to start if not present. 
set -ex set -o pipefail echo "Starting linux generate script" -if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then +if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then export CUDACXX=/usr/local/cuda/bin/nvcc fi +COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" +OLLAMA_DYN_LIB_DIR="gguf/build/lib" +mkdir -p ${OLLAMA_DYN_LIB_DIR} +touch ${OLLAMA_DYN_LIB_DIR}/.generated source $(dirname $0)/gen_common.sh init_vars git_module_setup apply_patches -if [ -d /usr/local/cuda/lib64/ ] ; then - CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" -else - CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" -fi -BUILD_DIR="gguf/build/cuda" -LIB_DIR="${BUILD_DIR}/lib" -mkdir -p ../../dist/ + +# +# CPU first for the default library +# +CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" +BUILD_DIR="gguf/build/cpu" build -if [ -d /usr/local/cuda/lib64/ ] ; then - pwd - ar -M < - -#include "server.h" - -#ifdef __cplusplus -extern "C" { -#endif -struct rocm_llama_server { - void *handle; - void (*llama_server_init)(ext_server_params_t *sparams, - ext_server_resp_t *err); - void (*llama_server_start)(); - void (*llama_server_stop)(); - void (*llama_server_completion)(const char *json_req, - ext_server_resp_t *resp); - void (*llama_server_completion_next_result)(const int task_id, - ext_server_task_result_t *result); - void (*llama_server_completion_cancel)(const int task_id, - ext_server_resp_t *err); - void (*llama_server_release_task_result)(ext_server_task_result_t *result); - void (*llama_server_tokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_detokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_embedding)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_release_json_resp)(char **json_resp); -}; - -void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, - ext_server_resp_t *err); - -// No good way to call C function pointers from Go so inline the indirection -void rocm_shim_llama_server_init(struct rocm_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err); - -void rocm_shim_llama_server_start(struct rocm_llama_server s); - -void rocm_shim_llama_server_stop(struct rocm_llama_server s); - -void rocm_shim_llama_server_completion(struct rocm_llama_server s, - const char *json_req, - ext_server_resp_t *resp); - -void rocm_shim_llama_server_completion_next_result( - struct rocm_llama_server s, const int task_id, - ext_server_task_result_t *result); - -void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, - const int task_id, - ext_server_resp_t *err); - -void rocm_shim_llama_server_release_task_result( - struct rocm_llama_server s, ext_server_task_result_t *result); - -void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); - -void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); - -void rocm_shim_llama_server_embedding(struct rocm_llama_server s, - const char *json_req, char 
**json_resp, - ext_server_resp_t *err); -void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, - char **json_resp); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go index f63ce8c8..98e7a7d5 100644 --- a/llm/shim_darwin.go +++ b/llm/shim_darwin.go @@ -12,13 +12,13 @@ import ( //go:embed llama.cpp/gguf/ggml-metal.metal var libEmbed embed.FS -func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { // should never happen... - return nil, fmt.Errorf("ROCM GPUs not supported on Mac") + return nil, fmt.Errorf("Dynamic library loading not supported on Mac") } func nativeInit(workdir string) error { - err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal") + _, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal") if err != nil { if err == payloadMissing { // TODO perhaps consider this a hard failure on arm macs? diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index fa841d49..d9c2df46 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -5,7 +5,7 @@ package llm /* #include -#include "rocm_shim.h" +#include "dynamic_shim.h" */ import "C" @@ -18,20 +18,20 @@ import ( "log" "os" "path/filepath" - "runtime" + "strings" "sync" "unsafe" "github.com/jmorganca/ollama/api" ) -//go:embed llama.cpp/gguf/build/*/lib/* +//go:embed llama.cpp/gguf/build/lib/* var libEmbed embed.FS var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported") type shimExtServer struct { - s C.struct_rocm_llama_server + s C.struct_dynamic_llama_server options api.Options } @@ -40,50 +40,58 @@ var shimMutex sync.Mutex var llm *shimExtServer func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_init(llm.s, sparams, err) + C.dynamic_shim_llama_server_init(llm.s, sparams, err) } func (llm *shimExtServer) llama_server_start() { - C.rocm_shim_llama_server_start(llm.s) + C.dynamic_shim_llama_server_start(llm.s) } func (llm *shimExtServer) llama_server_stop() { - C.rocm_shim_llama_server_stop(llm.s) + C.dynamic_shim_llama_server_stop(llm.s) } func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { - C.rocm_shim_llama_server_completion(llm.s, json_req, resp) + C.dynamic_shim_llama_server_completion(llm.s, json_req, resp) } func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { - C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp) + C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp) } func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err) + C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err) } func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { - C.rocm_shim_llama_server_release_task_result(llm.s, result) + C.dynamic_shim_llama_server_release_task_result(llm.s, result) } func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) + 
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) + C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err) + C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { - C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp) + C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp) } -func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { - if !ShimPresent { - return nil, RocmShimMissing +func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + shimMutex.Lock() + defer shimMutex.Unlock() + libPath := C.CString(library) + defer C.free(unsafe.Pointer(libPath)) + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var srv C.struct_dynamic_llama_server + C.dynamic_shim_init(libPath, &srv, &resp) + if resp.id < 0 { + return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) } - log.Printf("Loading ROCM llm server") - if llm == nil { - return nil, fmt.Errorf("nativeInit wasnt called or libary load failed") + llm = &shimExtServer{ + s: srv, + options: opts, } - llm.options = opts + log.Printf("Loading Dynamic Shim llm server: %s", library) return newExtServer(llm, model, adapters, projectors, numLayers, opts) } @@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() { } func nativeInit(workdir string) error { - err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*") + libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*") if err != nil { if err == payloadMissing { - log.Printf("%s", RocmShimMissing) + log.Printf("%s", payloadMissing) return nil } return err - } else { - ShimPresent = true + } + for _, lib := range libs { + libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0] + AvailableShims[libName] = lib } - // Verify we have permissions - either running as root, or we have group access to the driver - fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) - if err != nil { - if errors.Is(err, fs.ErrPermission) { - log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.") - return err - } else if errors.Is(err, fs.ErrNotExist) { - // expected behavior without a radeon card - return nil + // Only check ROCm access if we have the dynamic lib loaded + if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent { + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + log.Fatalf("Radeon card detected, but permissions not set up properly. 
Either run ollama as root, or add you user account to the render group.") + return err + } else if errors.Is(err, fs.ErrNotExist) { + // expected behavior without a radeon card + return nil + } + + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) } + fd.Close() - return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) } - fd.Close() - shimMutex.Lock() - defer shimMutex.Unlock() - if llm != nil { - return nil - } - var libName string - switch runtime.GOOS { - case "darwin": - // shouldn't happen - return nil - case "linux": - libName = "librocm_server.so" - case "windows": - libName = "rocm_server.dll" - default: - // shouldn't happen - return nil - } - libPath := C.CString(filepath.Join(workdir, libName)) - defer C.free(unsafe.Pointer(libPath)) - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - var srv C.struct_rocm_llama_server - C.rocm_shim_init(libPath, &srv, &resp) - if resp.id < 0 { - // TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm - // and run against CPU - return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg)) - } - llm = &shimExtServer{ - s: srv, - options: api.DefaultOptions(), - } return nil } From d9cd3d9667d83f68040378dc2834a49962e08244 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 20 Dec 2023 14:46:15 -0800 Subject: [PATCH 16/19] Revive windows build The windows native setup still needs some more work, but this gets it building again and if you set the PATH properly, you can run the resulting exe on a cuda system. --- llm/ext_server.go | 4 +-- llm/llama.cpp/gen_windows.ps1 | 61 ++++++++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/llm/ext_server.go b/llm/ext_server.go index 048b1a65..45251cc5 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -22,8 +22,8 @@ package llm #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm -#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin -#cgo windows LDFLAGS: -lext_server_shared -lpthread +#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincpu/dist/lib +#cgo windows LDFLAGS: -lcpu_server -lpthread #include #include "server.h" diff --git a/llm/llama.cpp/gen_windows.ps1 b/llm/llama.cpp/gen_windows.ps1 index f85f1a45..2c77d4ab 100644 --- a/llm/llama.cpp/gen_windows.ps1 +++ b/llm/llama.cpp/gen_windows.ps1 @@ -3,52 +3,89 @@ $ErrorActionPreference = "Stop" function init_vars { - $script:buildDir="gguf/build/wincuda" - $script:installDir="gguf/build/wincuda/dist" $script:patches = @("0001-Expose-callable-API-for-server.patch") - $script:cmakeDefs = @("-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-DLLAMA_CUBLAS=ON","-DCMAKE_VERBOSE_MAKEFILE=ON","-DBUILD_SHARED_LIBS=on","-A","x64") + $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-A","x64") if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on") - $script:config += "RelWithDebInfo" + $script:config = "RelWithDebInfo" } else { - $script:config += "Release" + $script:config = "Release" } } function git_module_setup { # TODO add flags 
to skip the init/patch logic to make it easier to mod llama.cpp code in-repo & git submodule init + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & git submodule update --force gguf + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } function apply_patches { rm -erroraction ignore -path "gguf/examples/server/server.h" - foreach ($patch in $patches) { + foreach ($patch in $script:patches) { write-host "Applying patch $patch" & git -C gguf apply ../patches/$patch + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } function build { - write-host "generating config with: cmake -S gguf -B $buildDir $cmakeDefs" + write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs" & cmake --version - & cmake -S gguf -B $buildDir $cmakeDefs - write-host "building with: cmake --build $buildDir --config $config" - & cmake --build $buildDir --config $config + & cmake -S gguf -B $script:buildDir $script:cmakeDefs + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + write-host "building with: cmake --build $script:buildDir --config $script:config" + & cmake --build $script:buildDir --config $script:config + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } function install { - rm -erroraction ignore -recurse -force -path $installDir - & cmake --install $buildDir --prefix $installDir --config $config + rm -erroraction ignore -recurse -force -path $script:installDir + & cmake --install $script:buildDir --prefix $script:installDir --config $script:config + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } init_vars git_module_setup apply_patches + +# first build CPU based +$script:buildDir="gguf/build/wincpu" +$script:installDir="gguf/build/wincpu/dist" + +build +# install + +md gguf/build/lib -ea 0 +md gguf/build/wincpu/dist/lib -ea 0 +mv gguf/build/wincpu/bin/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.dll + + +# Nope, this barfs on lots of symbol problems +#mv gguf/build/wincpu/examples/server/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.lib +# Nope: this needs lots of include paths to pull in things like msvcprt.lib and other deps +# & cl.exe ` +# gguf/build/wincpu/examples/server/$script:config/ext_server.lib ` +# gguf/build/wincpu/common/$script:config/common.lib ` +# gguf/build/wincpu/$script:config/llama.lib ` +# gguf/build/wincpu/$script:config/ggml_static.lib ` +# /link /DLL /DEF:cpu_server.def /NOENTRY /MACHINE:X64 /OUT:gguf/build/wincpu/dist/lib/cpu_server.dll +# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + +# Then build cuda as a dynamically loaded library +init_vars +$script:buildDir="gguf/build/wincuda" +$script:installDir="gguf/build/wincuda/dist" +$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DBUILD_SHARED_LIBS=on") build install +cp gguf/build/wincuda/dist/bin/ext_server_shared.dll gguf/build/lib/cuda_server.dll + +# TODO - more to do here to create a usable dll + # TODO - implement ROCm support on windows md gguf/build/winrocm/lib -ea 0 From 325d74985b9f31917ead1585ea22389a39b280b5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 21 Dec 2023 16:23:36 -0800 Subject: [PATCH 17/19] Fix CPU performance on hyperthreaded systems The default thread count logic was broken and resulted in 2x the number of threads as it should on a hyperthreading CPU resulting in thrashing and poor performance. 
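To illustrate the fix, here is a small self-contained Go sketch (the `effectiveThreads` helper is hypothetical, not code from this patch). `runtime.NumCPU()` counts logical CPUs, so the old default requested roughly twice as many threads as there are physical cores on a hyperthreaded machine; the change passes `opts.NumThread` through unchanged, and the patched `server.cpp` only overrides llama.cpp's own default when `n_threads > 0`.

```go
package main

import (
	"fmt"
	"runtime"
)

// effectiveThreads mirrors the new contract: a value of 0 is passed through
// to the server, which then applies llama.cpp's own default (typically based
// on physical cores); any positive value is honored as an explicit override.
func effectiveThreads(numThread int) string {
	if numThread > 0 {
		return fmt.Sprintf("%d (explicit override)", numThread)
	}
	return "server default (physical cores)"
}

func main() {
	// On a hyperthreaded machine this is usually 2x the physical core count,
	// which is why defaulting n_threads to runtime.NumCPU() caused thrashing.
	fmt.Println("logical CPUs:", runtime.NumCPU())
	fmt.Println("n_threads when NumThread=0:", effectiveThreads(0))
	fmt.Println("n_threads when NumThread=8:", effectiveThreads(8))
}
```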
--- llm/ext_server.go | 7 +------ .../0001-Expose-callable-API-for-server.patch | 14 ++++++++------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llm/ext_server.go b/llm/ext_server.go index 45251cc5..0d3327da 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -37,7 +37,6 @@ import ( "fmt" "log" "os" - "runtime" "strings" "sync" "time" @@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string, sparams.mmproj = nil } - if opts.NumThread > 0 { - sparams.n_threads = C.uint(opts.NumThread) - } else { - sparams.n_threads = C.uint(runtime.NumCPU()) - } + sparams.n_threads = C.uint(opts.NumThread) log.Printf("Initializing internal llama server") resp := newExtServerResp(128) diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 2e5a981e..07e42972 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,4 +1,4 @@ -From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001 +From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server @@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server This adds an extern "C" interface within the example server --- examples/server/CMakeLists.txt | 24 +++ - examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++ + examples/server/server.cpp | 276 +++++++++++++++++++++++++++++++++ examples/server/server.h | 89 +++++++++++ ggml-cuda.cu | 1 + - 4 files changed, 388 insertions(+) + 4 files changed, 390 insertions(+) create mode 100644 examples/server/server.h diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 0403853..2084fd8 100644 +index 0403853..065420c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -67,7 +67,7 @@ index 0403853..2084fd8 100644 int main(int argc, char **argv) { #if SERVER_VERBOSE != 1 -@@ -3123,3 +3127,273 @@ int main(int argc, char **argv) +@@ -3123,3 +3127,275 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } @@ -89,7 +89,9 @@ index 0403853..2084fd8 100644 + gpt_params params; + params.n_ctx = sparams->n_ctx; + params.n_batch = sparams->n_batch; -+ params.n_threads = sparams->n_threads; ++ if (sparams->n_threads > 0) { ++ params.n_threads = sparams->n_threads; ++ } + params.n_parallel = sparams->n_parallel; + params.rope_freq_base = sparams->rope_freq_base; + params.rope_freq_scale = sparams->rope_freq_scale; From fa24e73b8253a554ec840395a5d1dfdb91d3598b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 21 Dec 2023 16:54:54 -0800 Subject: [PATCH 18/19] Remove CPU build, fixup linux build script --- Dockerfile.build | 10 +++--- Dockerfile.cpu | 35 ------------------- llm/llama.cpp/gen_common.sh | 2 +- llm/llama.cpp/gen_linux.sh | 5 +-- .../0001-Expose-callable-API-for-server.patch | 13 ++++--- scripts/build_linux.sh | 14 +++----- 6 files changed, 21 insertions(+), 58 deletions(-) delete mode 100644 Dockerfile.cpu diff --git a/Dockerfile.build b/Dockerfile.build index 6b7e3c4d..c8170919 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -4,6 +4,7 @@ ARG CUDA_VERSION=11.3.1-1 ARG 
CMAKE_VERSION=3.22.1 # ROCm only supports amd64 ARG ROCM_VERSION=6.0 +ARG CLBLAST_VER=1.6.1 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html RUN apt-get update && \ @@ -23,6 +24,10 @@ RUN apt-get update && \ apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev +# CLBlast +RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \ + cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install + ENV ROCM_PATH=/opt/rocm # Ubuntu 22.04 arm64 dependencies @@ -45,7 +50,6 @@ FROM base-${TARGETARCH} ARG TARGETARCH ARG GOFLAGS="'-ldflags -w -s'" ARG CGO_CFLAGS -ARG CLBLAST_VER=1.6.1 ARG GOLANG_VERSION=1.21.3 # Common toolchain @@ -53,10 +57,6 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10 -# CLBlast -RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \ - cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install - # install go ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz RUN mkdir -p /usr/local && tar xz -C /usr/local /dev/null; then diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh index e3cb87a8..3d659fff 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/llama.cpp/gen_linux.sh @@ -22,13 +22,14 @@ if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then fi COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" OLLAMA_DYN_LIB_DIR="gguf/build/lib" -mkdir -p ${OLLAMA_DYN_LIB_DIR} -touch ${OLLAMA_DYN_LIB_DIR}/.generated source $(dirname $0)/gen_common.sh init_vars git_module_setup apply_patches +mkdir -p ${OLLAMA_DYN_LIB_DIR} +touch ${OLLAMA_DYN_LIB_DIR}/.generated + # # CPU first for the default library # diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 07e42972..ac3fc12a 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,4 +1,4 @@ -From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001 +From 4c72576c5f6c2217b1ecf7fd8523616acc5526ae Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server @@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server This adds an extern "C" interface within the example server --- examples/server/CMakeLists.txt | 24 +++ - examples/server/server.cpp | 276 +++++++++++++++++++++++++++++++++ + examples/server/server.cpp | 279 +++++++++++++++++++++++++++++++++ examples/server/server.h | 89 +++++++++++ ggml-cuda.cu | 1 + - 4 files changed, 390 insertions(+) + 4 files changed, 393 insertions(+) create mode 100644 examples/server/server.h diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() 
\ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 0403853..065420c 100644 +index 0403853..5e78e4d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -67,7 +67,7 @@ index 0403853..065420c 100644 int main(int argc, char **argv) { #if SERVER_VERBOSE != 1 -@@ -3123,3 +3127,275 @@ int main(int argc, char **argv) +@@ -3123,3 +3127,278 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } @@ -80,6 +80,9 @@ index 0403853..065420c 100644 + +void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) +{ ++#if SERVER_VERBOSE != 1 ++ log_disable(); ++#endif + assert(err != NULL && sparams != NULL); + err->id = 0; + err->msg[0] = '\0'; diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 836de6ac..06a2ae1c 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -8,14 +8,8 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version mkdir -p dist for TARGETARCH in amd64 arm64; do - docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t gpubuilder:$TARGETARCH . - docker create --platform linux/$TARGETARCH --name gpubuilder-$TARGETARCH gpubuilder:$TARGETARCH - docker cp gpubuilder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH - docker rm gpubuilder-$TARGETARCH - - docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.cpu -t cpubuilder:$TARGETARCH . - docker create --platform linux/$TARGETARCH --name cpubuilder-$TARGETARCH cpubuilder:$TARGETARCH - docker cp cpubuilder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH-cpu - docker rm cpubuilder-$TARGETARCH - + docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . + docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH + docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH + docker rm builder-$TARGETARCH done From 495c06e4a67ecc1171faa22115443103a24af1da Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 21 Dec 2023 16:57:58 -0800 Subject: [PATCH 19/19] Fix doc glitch --- docs/modelfile.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modelfile.md b/docs/modelfile.md index 80e896eb..20113090 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -188,7 +188,7 @@ SYSTEM """""" ### ADAPTER -The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGUF file format. The adapter should be tuned from the base model otherwise the behaviour is undefined. +The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined. ```modelfile ADAPTER ./ollama-lora.bin