diff --git a/.gitignore b/.gitignore index ef88e17e..b116b785 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,16 @@ dist __pycache__ ollama ggml-metal.metal + +# cmake gitignore +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..e05bf02a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 3.12) +project(ollama) + +include(FetchContent) + +FetchContent_Declare( + "llama.cpp" + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG 55dbb91 +) + +FetchContent_MakeAvailable(llama.cpp) + +add_custom_target( + ollama + ALL + DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal + COMMAND + ${CMAKE_COMMAND} -E + env + CGO_CPPFLAGS='-I${llama.cpp_SOURCE_DIR}' + CGO_LDFLAGS='-L${llama.cpp_BINARY_DIR} -lllama -lggml_static -lm -lstdc++' + CGO_CXXFLAGS='-std=c++11' + -- + go build . + WORKING_DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR} +) + +add_custom_command( + OUTPUT + ${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal + COMMAND + ${CMAKE_COMMAND} -E + copy_if_different + ${llama.cpp_SOURCE_DIR}/ggml-metal.metal + ${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal + WORKING_DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR} +) + +add_dependencies(ollama llama ggml_static) diff --git a/Makefile b/Makefile deleted file mode 100644 index 2ba9ed2b..00000000 --- a/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -default: ollama - -.PHONY: llama -llama: - cmake -S llama -B llama/build -DLLAMA_METAL=on - cmake --build llama/build - -.PHONY: ollama -ollama: llama - go build . - -.PHONY: app -app: ollama - npm install --prefix app - npm run --prefix app make:sign - -clean: - go clean - rm -rf llama/build diff --git a/api/types.go b/api/types.go index 3443ae9b..e0f1f4da 100644 --- a/api/types.go +++ b/api/types.go @@ -1,5 +1,7 @@ package api +import "runtime" + type PullRequest struct { Model string `json:"model"` } @@ -14,93 +16,76 @@ type GenerateRequest struct { Model string `json:"model"` Prompt string `json:"prompt"` - ModelOptions *ModelOptions `json:"model_opts,omitempty"` - PredictOptions *PredictOptions `json:"predict_opts,omitempty"` -} - -type ModelOptions struct { - ContextSize int `json:"context_size,omitempty"` - Seed int `json:"seed,omitempty"` - NBatch int `json:"n_batch,omitempty"` - F16Memory bool `json:"memory_f16,omitempty"` - MLock bool `json:"mlock,omitempty"` - MMap bool `json:"mmap,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - Embeddings bool `json:"embeddings,omitempty"` - NUMA bool `json:"numa,omitempty"` - NGPULayers int `json:"gpu_layers,omitempty"` - MainGPU string `json:"main_gpu,omitempty"` - TensorSplit string `json:"tensor_split,omitempty"` -} - -type PredictOptions struct { - Seed int `json:"seed,omitempty"` - Threads int `json:"threads,omitempty"` - Tokens int `json:"tokens,omitempty"` - TopK int `json:"top_k,omitempty"` - Repeat int `json:"repeat,omitempty"` - Batch int `json:"batch,omitempty"` - NKeep int `json:"nkeep,omitempty"` - TopP float64 `json:"top_p,omitempty"` - Temperature float64 `json:"temp,omitempty"` - Penalty float64 `json:"penalty,omitempty"` - F16KV bool - DebugMode bool - StopPrompts []string - IgnoreEOS bool `json:"ignore_eos,omitempty"` - - TailFreeSamplingZ float64 `json:"tfs_z,omitempty"` - TypicalP float64 `json:"typical_p,omitempty"` - FrequencyPenalty float64 
`json:"freq_penalty,omitempty"` - PresencePenalty float64 `json:"pres_penalty,omitempty"` - Mirostat int `json:"mirostat,omitempty"` - MirostatETA float64 `json:"mirostat_lr,omitempty"` - MirostatTAU float64 `json:"mirostat_ent,omitempty"` - PenalizeNL bool `json:"penalize_nl,omitempty"` - LogitBias string `json:"logit_bias,omitempty"` - - PathPromptCache string - MLock bool `json:"mlock,omitempty"` - MMap bool `json:"mmap,omitempty"` - PromptCacheAll bool - PromptCacheRO bool - MainGPU string - TensorSplit string -} - -var DefaultModelOptions ModelOptions = ModelOptions{ - ContextSize: 512, - Seed: 0, - F16Memory: true, - MLock: false, - Embeddings: true, - MMap: true, - LowVRAM: false, -} - -var DefaultPredictOptions PredictOptions = PredictOptions{ - Seed: -1, - Threads: -1, - Tokens: 512, - Penalty: 1.1, - Repeat: 64, - Batch: 512, - NKeep: 64, - TopK: 90, - TopP: 0.86, - TailFreeSamplingZ: 1.0, - TypicalP: 1.0, - Temperature: 0.8, - FrequencyPenalty: 0.0, - PresencePenalty: 0.0, - Mirostat: 0, - MirostatTAU: 5.0, - MirostatETA: 0.1, - MMap: true, - StopPrompts: []string{"llama"}, + Options `json:"options"` } type GenerateResponse struct { Response string `json:"response"` } + +type Options struct { + Seed int `json:"seed,omitempty"` + + // Backend options + UseNUMA bool `json:"numa,omitempty"` + + // Model options + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + LowVRAM bool `json:"low_vram,omitempty"` + F16KV bool `json:"f16_kv,omitempty"` + LogitsAll bool `json:"logits_all,omitempty"` + VocabOnly bool `json:"vocab_only,omitempty"` + UseMMap bool `json:"use_mmap,omitempty"` + UseMLock bool `json:"use_mlock,omitempty"` + EmbeddingOnly bool `json:"embedding_only,omitempty"` + + // Predict options + RepeatLastN int `json:"repeat_last_n,omitempty"` + RepeatPenalty float32 `json:"repeat_penalty,omitempty"` + FrequencyPenalty float32 `json:"frequency_penalty,omitempty"` + PresencePenalty float32 `json:"presence_penalty,omitempty"` + Temperature float32 `json:"temperature,omitempty"` + TopK int `json:"top_k,omitempty"` + TopP float32 `json:"top_p,omitempty"` + TFSZ float32 `json:"tfs_z,omitempty"` + TypicalP float32 `json:"typical_p,omitempty"` + Mirostat int `json:"mirostat,omitempty"` + MirostatTau float32 `json:"mirostat_tau,omitempty"` + MirostatEta float32 `json:"mirostat_eta,omitempty"` + + NumThread int `json:"num_thread,omitempty"` +} + +func DefaultOptions() Options { + return Options{ + Seed: -1, + + UseNUMA: false, + + NumCtx: 512, + NumBatch: 512, + NumGPU: 1, + LowVRAM: false, + F16KV: true, + UseMMap: true, + UseMLock: false, + + RepeatLastN: 512, + RepeatPenalty: 1.1, + FrequencyPenalty: 0.0, + PresencePenalty: 0.0, + Temperature: 0.8, + TopK: 40, + TopP: 0.9, + TFSZ: 1.0, + TypicalP: 1.0, + Mirostat: 0, + MirostatTau: 5.0, + MirostatEta: 0.1, + + NumThread: runtime.NumCPU(), + } +} diff --git a/go.mod b/go.mod index c2e15346..8beb32bd 100644 --- a/go.mod +++ b/go.mod @@ -39,6 +39,7 @@ require ( golang.org/x/arch v0.3.0 // indirect golang.org/x/crypto v0.10.0 // indirect golang.org/x/net v0.10.0 // indirect + golang.org/x/sync v0.3.0 golang.org/x/sys v0.10.0 // indirect golang.org/x/term v0.10.0 golang.org/x/text v0.10.0 // indirect diff --git a/go.sum b/go.sum index 2adee49d..9189b115 100644 --- a/go.sum +++ b/go.sum @@ -99,6 +99,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/sync 
v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt deleted file mode 100644 index 3ea66d7e..00000000 --- a/llama/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -cmake_minimum_required(VERSION 3.12) -project(binding) - -include(FetchContent) - -FetchContent_Declare( - llama_cpp - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG 55dbb91 -) - -FetchContent_MakeAvailable(llama_cpp) - -add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp) -target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples) -target_link_libraries(binding llama ggml_static) - -if (LLAMA_METAL) - configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_CURRENT_BINARY_DIR}/../../ggml-metal.metal COPYONLY) -endif() - -add_custom_target(copy_libllama ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${CMAKE_CURRENT_BINARY_DIR}) -add_custom_target(copy_libggml_static ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/llama/binding/binding.cpp b/llama/binding/binding.cpp deleted file mode 100644 index 50d9e018..00000000 --- a/llama/binding/binding.cpp +++ /dev/null @@ -1,705 +0,0 @@ -// MIT License - -// Copyright (c) 2023 go-skynet authors - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "common.h" -#include "llama.h" - -#include "binding.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) -#include -#include -#elif defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include -#include -#endif - -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \ - defined(_WIN32) -void sigint_handler(int signo) { - if (signo == SIGINT) { - _exit(130); - } -} -#endif - -int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings) { - gpt_params *params_p = (gpt_params *)params_ptr; - llama_context *ctx = (llama_context *)state_pr; - gpt_params params = *params_p; - - if (params.seed <= 0) { - params.seed = time(NULL); - } - - std::mt19937 rng(params.seed); - - llama_init_backend(params.numa); - - int n_past = 0; - - // Add a space in front of the first character to match OG llama tokenizer - // behavior - params.prompt.insert(0, 1, ' '); - - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); - - // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); - - if (embd_inp.size() > 0) { - if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, - params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const int n_embd = llama_n_embd(ctx); - - const auto embeddings = llama_get_embeddings(ctx); - - for (int i = 0; i < n_embd; i++) { - res_embeddings[i] = embeddings[i]; - } - - return 0; -} - -int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens, - int tokenSize, float *res_embeddings) { - gpt_params *params_p = (gpt_params *)params_ptr; - llama_context *ctx = (llama_context *)state_pr; - gpt_params params = *params_p; - - for (int i = 0; i < tokenSize; i++) { - auto token_str = llama_token_to_str(ctx, tokens[i]); - if (token_str == nullptr) { - continue; - } - std::vector my_vector; - std::string str_token(token_str); // create a new std::string from the char* - params_p->prompt += str_token; - } - - return get_embeddings(params_ptr, state_pr, res_embeddings); -} - -int eval(void *params_ptr, void *state_pr, char *text) { - gpt_params *params_p = (gpt_params *)params_ptr; - llama_context *ctx = (llama_context *)state_pr; - - auto n_past = 0; - auto last_n_tokens_data = - std::vector(params_p->repeat_last_n, 0); - - auto tokens = std::vector(params_p->n_ctx); - auto n_prompt_tokens = - llama_tokenize(ctx, text, tokens.data(), tokens.size(), true); - - if (n_prompt_tokens < 1) { - fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); - return 1; - } - - // evaluate prompt - return llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, - params_p->n_threads); -} - -int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug) { - gpt_params *params_p = (gpt_params *)params_ptr; - llama_context *ctx = (llama_context *)state_pr; - - gpt_params params = *params_p; - - const int n_ctx = llama_n_ctx(ctx); - - if (params.seed <= 0) { - params.seed = time(NULL); - } - - std::mt19937 rng(params.seed); - - std::string path_session = params.path_prompt_cache; - std::vector session_tokens; - - if (!path_session.empty()) { - if (debug) { - fprintf(stderr, "%s: attempting to load saved session from '%s'\n", - __func__, path_session.c_str()); - } - // fopen to check for existing session - FILE *fp = std::fopen(path_session.c_str(), "rb"); - if (fp != NULL) { - 
std::fclose(fp); - - session_tokens.resize(n_ctx); - size_t n_token_count_out = 0; - if (!llama_load_session_file( - ctx, path_session.c_str(), session_tokens.data(), - session_tokens.capacity(), &n_token_count_out)) { - fprintf(stderr, "%s: error: failed to load session file '%s'\n", - __func__, path_session.c_str()); - return 1; - } - session_tokens.resize(n_token_count_out); - llama_set_rng_seed(ctx, params.seed); - if (debug) { - fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", - __func__, (int)session_tokens.size()); - } - } else { - if (debug) { - fprintf(stderr, "%s: session file does not exist, will create\n", - __func__); - } - } - } - - std::vector embd_inp; - if (!params.prompt.empty() || session_tokens.empty()) { - // Add a space in front of the first character to match OG llama tokenizer - // behavior - params.prompt.insert(0, 1, ' '); - - embd_inp = ::llama_tokenize(ctx, params.prompt, true); - } else { - embd_inp = session_tokens; - } - - // debug message about similarity of saved session, if applicable - size_t n_matching_session_tokens = 0; - if (session_tokens.size()) { - for (llama_token id : session_tokens) { - if (n_matching_session_tokens >= embd_inp.size() || - id != embd_inp[n_matching_session_tokens]) { - break; - } - n_matching_session_tokens++; - } - if (debug) { - if (params.prompt.empty() && - n_matching_session_tokens == embd_inp.size()) { - fprintf(stderr, "%s: using full prompt from session file\n", __func__); - } else if (n_matching_session_tokens >= embd_inp.size()) { - fprintf(stderr, "%s: session file has exact match for prompt!\n", - __func__); - } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - fprintf(stderr, - "%s: warning: session file has low similarity to prompt (%zu / " - "%zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } else { - fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } - } - } - // if we will use the cache for the full prompt without reaching the end of - // the cache, force reevaluation of the last token token to recalculate the - // cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && - session_tokens.size() > embd_inp.size()) { - session_tokens.resize(embd_inp.size() - 1); - } - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) { - params.n_keep = (int)embd_inp.size(); - } - - // determine newline token - auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); - - // TODO: replace with ring-buffer - std::vector last_n_tokens(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - bool need_to_save_session = - !path_session.empty() && n_matching_session_tokens < embd_inp.size(); - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - int n_session_consumed = 0; - - std::vector embd; - std::string res = ""; - - // do one empty run to warm up the model - { - const std::vector tmp = { - llama_token_bos(), - }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - llama_reset_timings(ctx); - } - - while (n_remain != 0) { - // predict - if (embd.size() > 0) { - // infinite text generation via context swapping - // if we run out of context: - // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the - // logits in batches - if 
(n_past + (int)embd.size() > n_ctx) { - const int n_left = n_past - params.n_keep; - - // always keep the first token - BOS - n_past = std::max(1, params.n_keep); - - // insert n_left/2 tokens at the start of embd from last_n_tokens - embd.insert(embd.begin(), - last_n_tokens.begin() + n_ctx - n_left / 2 - embd.size(), - last_n_tokens.end() - embd.size()); - - // stop saving session if we run out of context - path_session.clear(); - - // printf("\n---\n"); - // printf("resetting: '"); - // for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); - // } - // printf("'\n"); - // printf("\n---\n"); - } - - // try to reuse a matching prefix from the loaded session instead of - // re-eval (via n_past) - if (n_session_consumed < (int)session_tokens.size()) { - size_t i = 0; - for (; i < embd.size(); i++) { - if (embd[i] != session_tokens[n_session_consumed]) { - session_tokens.resize(n_session_consumed); - break; - } - - n_past++; - n_session_consumed++; - - if (n_session_consumed >= (int)session_tokens.size()) { - ++i; - break; - } - } - if (i > 0) { - embd.erase(embd.begin(), embd.begin() + i); - } - } - - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not - // always - for (int i = 0; i < (int)embd.size(); i += params.n_batch) { - int n_eval = (int)embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - n_past += n_eval; - } - - if (embd.size() > 0 && !path_session.empty()) { - session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - n_session_consumed = session_tokens.size(); - } - } - - embd.clear(); - - if ((int)embd_inp.size() <= n_consumed) { - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = - params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = - params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - - // optionally save the session on first sample (for faster prompt loading - // next time) - if (!path_session.empty() && need_to_save_session && - !params.prompt_cache_ro) { - need_to_save_session = false; - llama_save_session_file(ctx, path_session.c_str(), - session_tokens.data(), session_tokens.size()); - } - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); - it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back( - llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), - candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = - std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty( - ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties( - ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) { - logits[llama_token_nl()] = nl_logit; - } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, - mirostat_eta, mirostat_m, - &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2( - ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - // printf("`%d`", candidates_p.size); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } - - // add it to the context - embd.push_back(id); - - // decrement remaining sampling budget - --n_remain; - - // call the token callback, no need to check if one is actually - // registered, that will be handled on the Go side. 
- auto token_str = llama_token_to_str(ctx, id); - if (!tokenCallback(state_pr, (char *)token_str)) { - break; - } - } else { - // some user input remains from prompt or interaction, forward it to - // processing - while ((int)embd_inp.size() > n_consumed) { - embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); - ++n_consumed; - if ((int)embd.size() >= params.n_batch) { - break; - } - } - } - - for (auto id : embd) { - res += llama_token_to_str(ctx, id); - } - - // check for stop prompt - if (params.antiprompt.size()) { - std::string last_output; - for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); - } - // Check if each of the reverse prompts appears at the end of the output. - for (std::string &antiprompt : params.antiprompt) { - // size_t extra_padding = params.interactive ? 0 : 2; - size_t extra_padding = 2; - size_t search_start_pos = - last_output.length() > - static_cast(antiprompt.length() + extra_padding) - ? last_output.length() - - static_cast(antiprompt.length() + extra_padding) - : 0; - - if (last_output.find(antiprompt.c_str(), search_start_pos) != - std::string::npos) { - goto end; - } - } - } - - // end of text token - if (!embd.empty() && embd.back() == llama_token_eos()) { - break; - } - } - - if (!path_session.empty() && params.prompt_cache_all && - !params.prompt_cache_ro) { - if (debug) { - fprintf(stderr, "\n%s: saving final output to session file '%s'\n", - __func__, path_session.c_str()); - } - llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), - session_tokens.size()); - } - -end: -#if defined(_WIN32) - signal(SIGINT, SIG_DFL); -#endif - - if (debug) { - llama_print_timings(ctx); - llama_reset_timings(ctx); - } - - strcpy(result, res.c_str()); - return 0; -} - -void llama_binding_free_model(void *state_ptr) { - llama_context *ctx = (llama_context *)state_ptr; - llama_free(ctx); -} - -void llama_free_params(void *params_ptr) { - gpt_params *params = (gpt_params *)params_ptr; - delete params; -} - -int load_state(void *ctx, char *statefile, char *modes) { - llama_context *state = (llama_context *)ctx; - const llama_context *constState = static_cast(state); - const size_t state_size = llama_get_state_size(state); - uint8_t *state_mem = new uint8_t[state_size]; - - { - FILE *fp_read = fopen(statefile, modes); - if (state_size != llama_get_state_size(constState)) { - fprintf(stderr, "\n%s : failed to validate state size\n", __func__); - return 1; - } - - const size_t ret = fread(state_mem, 1, state_size, fp_read); - if (ret != state_size) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - return 1; - } - - llama_set_state_data( - state, state_mem); // could also read directly from memory mapped file - fclose(fp_read); - } - - return 0; -} - -void save_state(void *ctx, char *dst, char *modes) { - llama_context *state = (llama_context *)ctx; - - const size_t state_size = llama_get_state_size(state); - uint8_t *state_mem = new uint8_t[state_size]; - - // Save state (rng, logits, embedding and kv_cache) to file - { - FILE *fp_write = fopen(dst, modes); - llama_copy_state_data( - state, state_mem); // could also copy directly to memory mapped file - fwrite(state_mem, 1, state_size, fp_write); - fclose(fp_write); - } -} - -void *llama_allocate_params( - const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty, int repeat_last_n, - bool ignore_eos, bool memory_f16, int n_batch, 
int n_keep, - const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p, - float frequency_penalty, float presence_penalty, int mirostat, - float mirostat_eta, float mirostat_tau, bool penalize_nl, - const char *logit_bias, bool mlock, bool mmap, const char *maingpu, - const char *tensorsplit) { - gpt_params *params = new gpt_params; - params->seed = seed; - params->n_threads = threads; - params->n_predict = tokens; - params->repeat_last_n = repeat_last_n; - params->top_k = top_k; - params->top_p = top_p; - params->memory_f16 = memory_f16; - params->temp = temp; - params->use_mmap = mmap; - params->use_mlock = mlock; - params->repeat_penalty = repeat_penalty; - params->n_batch = n_batch; - params->n_keep = n_keep; - if (maingpu[0] != '\0') { - params->main_gpu = std::stoi(maingpu); - } - - if (tensorsplit[0] != '\0') { - std::string arg_next = tensorsplit; - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - if (i < split_arg.size()) { - params->tensor_split[i] = std::stof(split_arg[i]); - } else { - params->tensor_split[i] = 0.0f; - } - } - } - - if (ignore_eos) { - params->logit_bias[llama_token_eos()] = -INFINITY; - } - - for (int i = 0; i < antiprompt_count; i++) { - params->antiprompt.push_back(antiprompt[i]); - } - - params->tfs_z = tfs_z; - params->typical_p = typical_p; - params->presence_penalty = presence_penalty; - params->mirostat = mirostat; - params->mirostat_eta = mirostat_eta; - params->mirostat_tau = mirostat_tau; - params->penalize_nl = penalize_nl; - std::stringstream ss(logit_bias); - llama_token key; - char sign; - std::string value_str; - if (ss >> key && ss >> sign && std::getline(ss, value_str) && - (sign == '+' || sign == '-')) { - params->logit_bias[key] = - std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); - } - params->frequency_penalty = frequency_penalty; - params->prompt = prompt; - - return params; -} - -void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, - bool mlock, bool embeddings, bool mmap, bool low_vram, - bool vocab_only, int n_gpu_layers, int n_batch, - const char *maingpu, const char *tensorsplit, bool numa) { - // load the model - auto lparams = llama_context_default_params(); - - lparams.n_ctx = n_ctx; - lparams.seed = n_seed; - lparams.f16_kv = memory_f16; - lparams.embedding = embeddings; - lparams.use_mlock = mlock; - lparams.n_gpu_layers = n_gpu_layers; - lparams.use_mmap = mmap; - lparams.low_vram = low_vram; - lparams.vocab_only = vocab_only; - - if (maingpu[0] != '\0') { - lparams.main_gpu = std::stoi(maingpu); - } - - if (tensorsplit[0] != '\0') { - std::string arg_next = tensorsplit; - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - if (i < split_arg.size()) { - lparams.tensor_split[i] = std::stof(split_arg[i]); - } else { - lparams.tensor_split[i] = 0.0f; - } - } - } - - lparams.n_batch = n_batch; - - llama_init_backend(numa); - void *res = nullptr; - try { - res = llama_init_from_file(fname, lparams); - } catch (std::runtime_error &e) { - fprintf(stderr, "failed %s", e.what()); - return res; - } - - return res; -} \ No newline at end of file diff --git a/llama/binding/binding.h b/llama/binding/binding.h deleted file mode 100644 index 79aa3142..00000000 --- a/llama/binding/binding.h +++ /dev/null @@ -1,69 +0,0 @@ -// MIT License - -// Copyright (c) 2023 go-skynet authors - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#ifdef __cplusplus - -extern "C" { - -#endif - -#include - -extern unsigned char tokenCallback(void *, char *); - -int load_state(void *ctx, char *statefile, char *modes); - -int eval(void *params_ptr, void *ctx, char *text); - -void save_state(void *ctx, char *dst, char *modes); - -void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, - bool mlock, bool embeddings, bool mmap, bool low_vram, - bool vocab_only, int n_gpu, int n_batch, const char *maingpu, - const char *tensorsplit, bool numa); - -int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings); - -int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens, - int tokenSize, float *res_embeddings); - -void *llama_allocate_params( - const char *prompt, int seed, int threads, int tokens, int top_k, - float top_p, float temp, float repeat_penalty, int repeat_last_n, - bool ignore_eos, bool memory_f16, int n_batch, int n_keep, - const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p, - float frequency_penalty, float presence_penalty, int mirostat, - float mirostat_eta, float mirostat_tau, bool penalize_nl, - const char *logit_bias, bool mlock, bool mmap, const char *maingpu, - const char *tensorsplit); - -void llama_free_params(void *params_ptr); - -void llama_binding_free_model(void *state); - -int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug); - -#ifdef __cplusplus - -} - -#endif diff --git a/llama/llama.go b/llama/llama.go index efa5c0d9..e230009c 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -1,215 +1,231 @@ -// MIT License - -// Copyright (c) 2023 go-skynet authors - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- package llama -// #cgo LDFLAGS: -Lbuild -lbinding -lllama -lm -lggml_static -lstdc++ -// #cgo CXXFLAGS: -std=c++11 -// #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -// #include "binding/binding.h" -// #include -import "C" +/* +#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders +#include +#include "llama.h" +struct llama_sample_options +{ + float repeat_penalty; + float frequency_penalty; + float presence_penalty; + float temperature; + int32_t top_k; + float top_p; + float tfs_z; + float typical_p; + int mirostat; + float mirostat_tau; + float mirostat_eta; +}; + +llama_token llama_sample( + struct llama_context *ctx, + struct llama_token_data *candidates, + size_t n_candidates, + const llama_token *last_tokens, + size_t n_last_tokens, + struct llama_sample_options *opts) +{ + llama_token_data_array candidates_p = { + candidates, + n_candidates, + false, + }; + + llama_sample_repetition_penalty( + ctx, &candidates_p, + last_tokens, n_last_tokens, + opts->repeat_penalty); + + llama_sample_frequency_and_presence_penalties( + ctx, &candidates_p, + last_tokens, n_last_tokens, + opts->frequency_penalty, opts->presence_penalty); + + if (opts->temperature <= 0) { + return llama_sample_token_greedy(ctx, &candidates_p); + } + + if (opts->mirostat == 1) { + int mirostat_m = 100; + float mirostat_mu = 2.0f * opts->mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, opts->temperature); + return llama_sample_token_mirostat( + ctx, &candidates_p, + opts->mirostat_tau, opts->mirostat_eta, + mirostat_m, &mirostat_mu); + } else if (opts->mirostat == 2) { + float mirostat_mu = 2.0f * opts->mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, opts->temperature); + return llama_sample_token_mirostat_v2( + ctx, &candidates_p, + opts->mirostat_tau, opts->mirostat_eta, + &mirostat_mu); + } else { + llama_sample_top_k(ctx, &candidates_p, opts->top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, opts->tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, opts->typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, opts->top_p, 1); + llama_sample_temperature(ctx, &candidates_p, opts->temperature); + return llama_sample_token(ctx, &candidates_p); + } +} +*/ +import "C" import ( - "fmt" + "errors" + "io" + "os" "strings" - "sync" "unsafe" + + "github.com/jmorganca/ollama/api" ) -type LLama struct { - ctx unsafe.Pointer - embeddings bool - contextSize int +type llama struct { + params *C.struct_llama_context_params + model *C.struct_llama_model + ctx *C.struct_llama_context + + api.Options } -func New(model string, mo ModelOptions) (*LLama, error) { - modelPath := C.CString(model) - defer C.free(unsafe.Pointer(modelPath)) - - ctx := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA)) - if ctx == nil { - return nil, fmt.Errorf("failed loading model") +func New(model string, opts api.Options) (*llama, error) { + if _, err := os.Stat(model); err != nil { + return nil, err } - ll := &LLama{ctx: ctx, contextSize: mo.ContextSize, embeddings: mo.Embeddings} + llm := llama{Options: opts} - return ll, nil + C.llama_init_backend(C.bool(llm.UseNUMA)) + + params := 
C.llama_context_default_params() + params.seed = C.uint(llm.Seed) + params.n_ctx = C.int(llm.NumCtx) + params.n_batch = C.int(llm.NumBatch) + params.n_gpu_layers = C.int(llm.NumGPU) + params.main_gpu = C.int(llm.MainGPU) + params.low_vram = C.bool(llm.LowVRAM) + params.f16_kv = C.bool(llm.F16KV) + params.logits_all = C.bool(llm.LogitsAll) + params.vocab_only = C.bool(llm.VocabOnly) + params.use_mmap = C.bool(llm.UseMMap) + params.use_mlock = C.bool(llm.UseMLock) + params.embedding = C.bool(llm.EmbeddingOnly) + llm.params = ¶ms + + cModel := C.CString(model) + defer C.free(unsafe.Pointer(cModel)) + + llm.model = C.llama_load_model_from_file(cModel, params) + llm.ctx = C.llama_new_context_with_model(llm.model, params) + + // warm up the model + bos := []C.llama_token{C.llama_token_bos()} + C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread)) + C.llama_reset_timings(llm.ctx) + + return &llm, nil } -func (l *LLama) Free() { - C.llama_binding_free_model(l.ctx) +func (llm *llama) Close() { + defer C.llama_free_model(llm.model) + defer C.llama_free(llm.ctx) + + C.llama_print_timings(llm.ctx) } -func (l *LLama) Eval(text string, po PredictOptions) error { - input := C.CString(text) - if po.Tokens == 0 { - po.Tokens = 99999999 - } - defer C.free(unsafe.Pointer(input)) - - reverseCount := len(po.StopPrompts) - reversePrompt := make([]*C.char, reverseCount) - var pass **C.char - for i, s := range po.StopPrompts { - cs := C.CString(s) - reversePrompt[i] = cs - pass = &reversePrompt[0] - defer C.free(unsafe.Pointer(cs)) +func (llm *llama) Predict(prompt string, fn func(string)) error { + if tokens := llm.tokenize(prompt); tokens != nil { + return llm.generate(tokens, fn) } - cLogitBias := C.CString(po.LogitBias) - defer C.free(unsafe.Pointer(cLogitBias)) + return errors.New("llama: tokenize") +} - cMainGPU := C.CString(po.MainGPU) - defer C.free(unsafe.Pointer(cMainGPU)) +func (llm *llama) tokenize(prompt string) []C.llama_token { + cPrompt := C.CString(prompt) + defer C.free(unsafe.Pointer(cPrompt)) - cTensorSplit := C.CString(po.TensorSplit) - defer C.free(unsafe.Pointer(cTensorSplit)) - - params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), - C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), - C.bool(po.IgnoreEOS), C.bool(po.F16KV), - C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), - C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), - C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias, - C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit, - ) - defer C.llama_free_params(params) - - ret := C.eval(params, l.ctx, input) - if ret != 0 { - return fmt.Errorf("inference failed") + tokens := make([]C.llama_token, llm.NumCtx) + if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 { + return tokens[:n] } return nil } -func (l *LLama) Predict(text string, po PredictOptions) (string, error) { - if po.TokenCallback != nil { - setCallback(l.ctx, po.TokenCallback) +func (llm *llama) detokenize(tokens ...C.llama_token) string { + var sb strings.Builder + for _, token := range tokens { + sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token))) } - input := C.CString(text) - if po.Tokens == 0 { - po.Tokens = 99999999 - } - defer C.free(unsafe.Pointer(input)) - - out := make([]byte, po.Tokens) - - 
reverseCount := len(po.StopPrompts) - reversePrompt := make([]*C.char, reverseCount) - var pass **C.char - for i, s := range po.StopPrompts { - cs := C.CString(s) - reversePrompt[i] = cs - pass = &reversePrompt[0] - defer C.free(unsafe.Pointer(cs)) - } - - cLogitBias := C.CString(po.LogitBias) - defer C.free(unsafe.Pointer(cLogitBias)) - - cMainGPU := C.CString(po.MainGPU) - defer C.free(unsafe.Pointer(cMainGPU)) - - cTensorSplit := C.CString(po.TensorSplit) - defer C.free(unsafe.Pointer(cTensorSplit)) - - params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), - C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), - C.bool(po.IgnoreEOS), C.bool(po.F16KV), - C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), - C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), - C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias, - C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit, - ) - defer C.llama_free_params(params) - - ret := C.llama_predict(params, l.ctx, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode)) - if ret != 0 { - return "", fmt.Errorf("inference failed") - } - res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) - - res = strings.TrimPrefix(res, " ") - res = strings.TrimPrefix(res, text) - res = strings.TrimPrefix(res, "\n") - - for _, s := range po.StopPrompts { - res = strings.TrimRight(res, s) - } - - if po.TokenCallback != nil { - setCallback(l.ctx, nil) - } - - return res, nil + return sb.String() } -// CGo only allows us to use static calls from C to Go, we can't just dynamically pass in func's. -// This is the next best thing, we register the callbacks in this map and call tokenCallback from -// the C code. We also attach a finalizer to LLama, so it will unregister the callback when the -// garbage collection frees it. +func (llm *llama) generate(tokens []C.llama_token, fn func(string)) error { + var opts C.struct_llama_sample_options + opts.repeat_penalty = C.float(llm.RepeatPenalty) + opts.frequency_penalty = C.float(llm.FrequencyPenalty) + opts.presence_penalty = C.float(llm.PresencePenalty) + opts.temperature = C.float(llm.Temperature) + opts.top_k = C.int(llm.TopK) + opts.top_p = C.float(llm.TopP) + opts.tfs_z = C.float(llm.TFSZ) + opts.typical_p = C.float(llm.TypicalP) + opts.mirostat = C.int(llm.Mirostat) + opts.mirostat_tau = C.float(llm.MirostatTau) + opts.mirostat_eta = C.float(llm.MirostatEta) -// SetTokenCallback registers a callback for the individual tokens created when running Predict. It -// will be called once for each token. The callback shall return true as long as the model should -// continue predicting the next token. When the callback returns false the predictor will return. -// The tokens are just converted into Go strings, they are not trimmed or otherwise changed. Also -// the tokens may not be valid UTF-8. -// Pass in nil to remove a callback. -// -// It is save to call this method while a prediction is running. 
-func (l *LLama) SetTokenCallback(callback func(token string) bool) { - setCallback(l.ctx, callback) -} + pastTokens := deque[C.llama_token]{capacity: llm.RepeatLastN} -var ( - m sync.Mutex - callbacks = map[uintptr]func(string) bool{} -) + for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) { + if retval := C.llama_eval(llm.ctx, unsafe.SliceData(tokens), C.int(len(tokens)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 { + return errors.New("llama: eval") + } -//export tokenCallback -func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool { - m.Lock() - defer m.Unlock() + token, err := llm.sample(pastTokens, &opts) + switch { + case err != nil: + return err + case errors.Is(err, io.EOF): + return nil + } - if callback, ok := callbacks[uintptr(statePtr)]; ok { - return callback(C.GoString(token)) + fn(llm.detokenize(token)) + + tokens = []C.llama_token{token} + + pastTokens.PushLeft(token) } - return true + return nil } -// setCallback can be used to register a token callback for LLama. Pass in a nil callback to -// remove the callback. -func setCallback(statePtr unsafe.Pointer, callback func(string) bool) { - m.Lock() - defer m.Unlock() +func (llm *llama) sample(pastTokens deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) { + numVocab := int(C.llama_n_vocab(llm.ctx)) + logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab) - if callback == nil { - delete(callbacks, uintptr(statePtr)) - } else { - callbacks[uintptr(statePtr)] = callback + candidates := make([]C.struct_llama_token_data, 0, numVocab) + for i := 0; i < numVocab; i++ { + candidates = append(candidates, C.llama_token_data{ + id: C.int(i), + logit: logits[i], + p: 0, + }) } + + token := C.llama_sample( + llm.ctx, + unsafe.SliceData(candidates), C.ulong(len(candidates)), + unsafe.SliceData(pastTokens.Data()), C.ulong(pastTokens.Len()), + opts) + if token != C.llama_token_eos() { + return token, nil + } + + return 0, io.EOF } diff --git a/llama/llama_cublas.go b/llama/llama_cublas.go deleted file mode 100644 index efd15192..00000000 --- a/llama/llama_cublas.go +++ /dev/null @@ -1,9 +0,0 @@ -//go:build cublas -// +build cublas - -package llama - -/* -#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/ -*/ -import "C" diff --git a/llama/llama_metal.go b/llama/llama_metal.go deleted file mode 100644 index 3758ef9b..00000000 --- a/llama/llama_metal.go +++ /dev/null @@ -1,2 +0,0 @@ -//go:build metal -package llama diff --git a/llama/llama_openblas.go b/llama/llama_openblas.go deleted file mode 100644 index 31e09f7e..00000000 --- a/llama/llama_openblas.go +++ /dev/null @@ -1,9 +0,0 @@ -//go:build openblas -// +build openblas - -package llama - -/* -#cgo LDFLAGS: -lopenblas -*/ -import "C" diff --git a/llama/options.go b/llama/options.go deleted file mode 100644 index 92a9216b..00000000 --- a/llama/options.go +++ /dev/null @@ -1,98 +0,0 @@ -// MIT License - -// Copyright (c) 2023 go-skynet authors - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies 
or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -package llama - -type ModelOptions struct { - ContextSize int - Seed int - NBatch int - F16Memory bool - MLock bool - MMap bool - VocabOnly bool - LowVRAM bool - Embeddings bool - NUMA bool - NGPULayers int - MainGPU string - TensorSplit string -} - -type PredictOptions struct { - Seed, Threads, Tokens, TopK, Repeat, Batch, NKeep int - TopP, Temperature, Penalty float64 - F16KV bool - DebugMode bool - StopPrompts []string - IgnoreEOS bool - - TailFreeSamplingZ float64 - TypicalP float64 - FrequencyPenalty float64 - PresencePenalty float64 - Mirostat int - MirostatETA float64 - MirostatTAU float64 - PenalizeNL bool - LogitBias string - TokenCallback func(string) bool - - MLock, MMap bool - MainGPU string - TensorSplit string -} - -type PredictOption func(p *PredictOptions) - -type ModelOption func(p *ModelOptions) - -var DefaultModelOptions ModelOptions = ModelOptions{ - ContextSize: 512, - Seed: 0, - F16Memory: false, - MLock: false, - Embeddings: false, - MMap: true, - LowVRAM: false, -} - -var DefaultOptions PredictOptions = PredictOptions{ - Seed: -1, - Threads: 4, - Tokens: 128, - Penalty: 1.1, - Repeat: 64, - Batch: 512, - NKeep: 64, - TopK: 40, - TopP: 0.95, - TailFreeSamplingZ: 1.0, - TypicalP: 1.0, - Temperature: 0.8, - FrequencyPenalty: 0.0, - PresencePenalty: 0.0, - Mirostat: 0, - MirostatTAU: 5.0, - MirostatETA: 0.1, - MMap: true, -} diff --git a/llama/utils.go b/llama/utils.go new file mode 100644 index 00000000..b0db27d4 --- /dev/null +++ b/llama/utils.go @@ -0,0 +1,104 @@ +package llama + +type node[T any] struct { + t T + next *node[T] + prev *node[T] +} + +type deque[T any] struct { + head *node[T] + tail *node[T] + size int + capacity int +} + +func (d *deque[T]) Empty() bool { + return d.size == 0 +} + +func (d *deque[T]) Len() int { + return d.size +} + +func (d *deque[T]) Cap() int { + return d.capacity +} + +func (d *deque[T]) Push(t T) { + if d.capacity > 0 && d.size >= d.capacity { + d.PopLeft() + } + + n := node[T]{t: t} + if d.head != nil { + n.next = d.head + d.head.prev = &n + d.head = &n + } else { + d.head = &n + d.tail = &n + } + + d.size++ +} + +func (d *deque[T]) PushLeft(t T) { + if d.capacity > 0 && d.size >= d.capacity { + d.Pop() + } + + n := node[T]{t: t} + if d.tail != nil { + n.prev = d.tail + d.tail.next = &n + d.tail = &n + } else { + d.head = &n + d.tail = &n + } + + d.size++ +} + +func (d *deque[T]) Pop() *T { + if d.Empty() { + return nil + } + + head := d.head + d.head = head.next + if d.head != nil { + d.head.prev = nil + } else { + d.tail = nil + } + + d.size-- + return &head.t +} + +func (d *deque[T]) PopLeft() *T { + if d.Empty() { + return nil + } + + tail := d.tail + d.tail = tail.prev + if d.tail != nil { + d.tail.next = nil + } else { + d.head = nil + } + + d.size-- + return &tail.t +} + +func (d *deque[T]) Data() (data []T) { + for n := d.head; n != nil; n = n.next { + data = append(data, n.t) + } + + return data +} diff --git a/server/routes.go b/server/routes.go index dcd48b07..47551f15 100644 --- 
a/server/routes.go +++ b/server/routes.go @@ -11,12 +11,12 @@ import ( "net/http" "os" "path" - "runtime" "strings" "text/template" "github.com/gin-gonic/gin" "github.com/lithammer/fuzzysearch/fuzzy" + "golang.org/x/sync/errgroup" "github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/llama" @@ -36,14 +36,10 @@ func cacheDir() string { } func generate(c *gin.Context) { - var req api.GenerateRequest - if req.ModelOptions == nil { - req.ModelOptions = &api.DefaultModelOptions + req := api.GenerateRequest{ + Options: api.DefaultOptions(), } - if req.PredictOptions == nil { - req.PredictOptions = &api.DefaultPredictOptions - } if err := c.ShouldBindJSON(&req); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return @@ -60,15 +56,12 @@ func generate(c *gin.Context) { req.Model = path.Join(cacheDir(), "models", req.Model+".bin") } - modelOpts := getModelOpts(req) - modelOpts.NGPULayers = 1 // hard-code this for now - - model, err := llama.New(req.Model, modelOpts) + llm, err := llama.New(req.Model, req.Options) if err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - defer model.Free() + defer llm.Close() templateNames := make([]string, 0, len(templates.Templates())) for _, template := range templates.Templates() { @@ -87,43 +80,41 @@ func generate(c *gin.Context) { } ch := make(chan string) - model.SetTokenCallback(func(token string) bool { - ch <- token - return true - }) - - predictOpts := getPredictOpts(req) - - go func() { + g, _ := errgroup.WithContext(c.Request.Context()) + g.Go(func() error { defer close(ch) - _, err := model.Predict(req.Prompt, predictOpts) - if err != nil { - panic(err) - } - }() - - c.Stream(func(w io.Writer) bool { - token, ok := <-ch - if !ok { - return false - } - - resp := api.GenerateResponse{ - Response: token, - } - - bts, err := json.Marshal(resp) - if err != nil { - return false - } - - bts = append(bts, '\n') - if _, err := w.Write(bts); err != nil { - return false - } - - return true + return llm.Predict(req.Prompt, func(s string) { + ch <- s + }) }) + + g.Go(func() error { + c.Stream(func(w io.Writer) bool { + s, ok := <-ch + if !ok { + return false + } + + bts, err := json.Marshal(api.GenerateResponse{Response: s}) + if err != nil { + return false + } + + bts = append(bts, '\n') + if _, err := w.Write(bts); err != nil { + return false + } + + return true + }) + + return nil + }) + + if err := g.Wait(); err != nil && !errors.Is(err, io.EOF) { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } } func Serve(ln net.Listener) error { @@ -195,53 +186,3 @@ func matchRankOne(source string, targets []string) (bestMatch string, bestRank i return } - -func getModelOpts(req api.GenerateRequest) llama.ModelOptions { - var opts llama.ModelOptions - opts.ContextSize = req.ModelOptions.ContextSize - opts.Seed = req.ModelOptions.Seed - opts.F16Memory = req.ModelOptions.F16Memory - opts.MLock = req.ModelOptions.MLock - opts.Embeddings = req.ModelOptions.Embeddings - opts.MMap = req.ModelOptions.MMap - opts.LowVRAM = req.ModelOptions.LowVRAM - - opts.NBatch = req.ModelOptions.NBatch - opts.VocabOnly = req.ModelOptions.VocabOnly - opts.NUMA = req.ModelOptions.NUMA - opts.NGPULayers = req.ModelOptions.NGPULayers - opts.MainGPU = req.ModelOptions.MainGPU - opts.TensorSplit = req.ModelOptions.TensorSplit - - return opts -} - -func getPredictOpts(req api.GenerateRequest) llama.PredictOptions { - var opts 
llama.PredictOptions - - if req.PredictOptions.Threads == -1 { - opts.Threads = runtime.NumCPU() - } else { - opts.Threads = req.PredictOptions.Threads - } - - opts.Seed = req.PredictOptions.Seed - opts.Tokens = req.PredictOptions.Tokens - opts.Penalty = req.PredictOptions.Penalty - opts.Repeat = req.PredictOptions.Repeat - opts.Batch = req.PredictOptions.Batch - opts.NKeep = req.PredictOptions.NKeep - opts.TopK = req.PredictOptions.TopK - opts.TopP = req.PredictOptions.TopP - opts.TailFreeSamplingZ = req.PredictOptions.TailFreeSamplingZ - opts.TypicalP = req.PredictOptions.TypicalP - opts.Temperature = req.PredictOptions.Temperature - opts.FrequencyPenalty = req.PredictOptions.FrequencyPenalty - opts.PresencePenalty = req.PredictOptions.PresencePenalty - opts.Mirostat = req.PredictOptions.Mirostat - opts.MirostatTAU = req.PredictOptions.MirostatTAU - opts.MirostatETA = req.PredictOptions.MirostatETA - opts.MMap = req.PredictOptions.MMap - - return opts -}
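
The single `options` field replaces the old `model_opts`/`predict_opts` objects, and `server/routes.go` now seeds the request with `api.DefaultOptions()` before binding, so a client only sends the knobs it wants to override. Below is a minimal sketch of that merge behavior, assuming the `api` package exactly as declared in this patch; it uses plain `encoding/json` rather than gin's `ShouldBindJSON`, and the payload values are invented for illustration:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/jmorganca/ollama/api"
)

func main() {
	// Seed with defaults the same way the generate handler does, then overlay
	// a hypothetical client payload that only overrides two options.
	req := api.GenerateRequest{Options: api.DefaultOptions()}
	payload := []byte(`{"model":"llama","prompt":"hello","options":{"temperature":0.2,"num_ctx":1024}}`)
	if err := json.Unmarshal(payload, &req); err != nil {
		panic(err)
	}

	// Options absent from the payload keep their DefaultOptions() values.
	fmt.Println(req.Model, req.NumCtx, req.Temperature, req.TopK) // llama 1024 0.2 40
}
```

Because the embedded `Options` struct carries the `json:"options"` tag, all sampling and model settings nest under a single `options` key instead of two separate top-level objects.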
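
The `deque` added in `llama/utils.go` is what `generate` uses to retain the last `RepeatLastN` tokens for the repetition and frequency/presence penalties: `PushLeft` appends at the tail and, once `capacity` is reached, evicts the oldest entry from the head. An illustrative in-package sketch of that bound (the `demoDeque` name is invented for the example):

```go
package llama

import "fmt"

// demoDeque shows the capacity-bounded behavior generate() relies on when it
// keeps only the most recent RepeatLastN tokens.
func demoDeque() {
	d := deque[int]{capacity: 3}
	for i := 1; i <= 5; i++ {
		// PushLeft appends at the tail; once the deque is full, the oldest
		// element is evicted from the head via Pop.
		d.PushLeft(i)
	}
	fmt.Println(d.Len(), d.Data()) // prints: 3 [3 4 5]
}
```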