From 5502ac8876047c231a86fdaa1ee5f3dae6a9b3a5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 15 Jan 2024 10:12:10 -0500 Subject: [PATCH] Update llama.cpp --- llama_cpp/llama_cpp.py | 43 ++++++++++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b261cd..9e8e3ce 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -229,6 +229,7 @@ LLAMA_SPLIT_NONE = 0 LLAMA_SPLIT_LAYER = 1 LLAMA_SPLIT_ROW = 2 + # typedef struct llama_token_data { # llama_token id; // token id # float logit; // log-odds of the token @@ -395,6 +396,7 @@ class llama_model_kv_override(Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; + # // Keep the booleans together to avoid misalignment during copy-by-value. # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -407,7 +409,7 @@ class llama_model_params(Structure): n_gpu_layers (int): number of layers to store in VRAM split_mode (int): how to split the model across multiple GPUs main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored - tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data @@ -1960,14 +1962,39 @@ _lib.llama_sample_repetition_penalties.restype = None # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 -# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. -# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. -# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. -# LLAMA_API void llama_sample_classifier_free_guidance( -# struct llama_context * ctx, +# /// @param logits Logits extracted from the original generation context. +# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# LLAMA_API void llama_sample_apply_guidance( +# struct llama_context * ctx, +# float * logits, +# float * logits_guidance, +# float scale); +def llama_sample_apply_guidance( + ctx: llama_context_p, + logits, # type: _Pointer[c_float] + logits_guidance, # type: _Pointer[c_float] + scale: Union[c_float, float], +): + """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806""" + return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale) + + +_lib.llama_sample_apply_guidance.argtypes = [ + llama_context_p, + c_float_p, + c_float_p, + c_float, +] +_lib.llama_sample_apply_guidance.restype = None + + +# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, # llama_token_data_array * candidates, -# struct llama_context * guidance_ctx, -# float scale); +# struct llama_context * guidance_ctx, +# float scale), +# "use llama_sample_apply_guidance() instead"); def llama_sample_classifier_free_guidance( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index bb0c139..4483396 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit bb0c1392479398f9aba86d9ec98db0b95ede6e6d +Subproject commit 4483396751c79dea540808b9cb9238245d06da2b