Compare commits

13 commits: fc2af04c15...c2e4d5820a

Commits (SHA1):

- c2e4d5820a
- 83d6b26e6f
- 255e1b4495
- d634efcdd9
- 6e0642ca19
- 027f7bc678
- 951e39caf9
- c3ef41ba06
- ae5682f500
- cd3f1bb387
- 6b018e00b1
- a6457ba74b
- af3ed503e9

13 changed files with 123 additions and 141 deletions
CHANGELOG.md

@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.77]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@bde7cd3cd949c1a85d3a199498ac98e78039d46f
+- fix: string value kv_overrides by @abetlen in df45a4b3fe46e72664bda87301b318210c6d4782
+- fix: Fix typo in Llama3VisionAlphaChatHandler by @abetlen in 165b4dc6c188f8fda2fc616154e111f710484eba
+- fix: Use numpy recarray for candidates data, fixes bug with temp < 0 by @abetlen in af3ed503e9ce60fe6b5365031abad4176a3536b3
+- fix: Disable Windows+CUDA workaround when compiling for HIPBLAS by Engininja2 in #1493
+
 ## [0.2.76]

 - feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
CMakeLists.txt

@@ -41,14 +41,16 @@ if (LLAMA_BUILD)
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-    )
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-    )
+    if (WIN32 AND (LLAMA_CUDA OR LLAMA_CUBLAS))
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        )
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        )
+    endif()

 if (LLAVA_BUILD)
     if (LLAMA_CUBLAS OR LLAMA_CUDA)
Makefile (3 changed lines)

@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .

+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist
README.md (11 changed lines)

@@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>

+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+source /opt/intel/oneapi/setvars.sh
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+
 ### Windows Notes
llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.76"
+__version__ = "0.2.77"
llama_cpp/_internals.py

@@ -128,9 +128,9 @@ class _LlamaModel:
         assert self.model is not None
         return llama_cpp.llama_token_get_score(self.model, token)

-    def token_get_type(self, token: int) -> int:
+    def token_get_attr(self, token: int) -> int:
         assert self.model is not None
-        return llama_cpp.llama_token_get_type(self.model, token)
+        return llama_cpp.llama_token_get_attr(self.model, token)

     # Special tokens

@@ -142,6 +142,14 @@ class _LlamaModel:
         assert self.model is not None
         return llama_cpp.llama_token_eos(self.model)

+    def token_cls(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_cls(self.model)
+
+    def token_sep(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_sep(self.model)
+
     def token_nl(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_nl(self.model)
@@ -545,13 +553,12 @@ class _LlamaBatch:
 class _LlamaTokenDataArray:
     def __init__(self, *, n_vocab: int):
         self.n_vocab = n_vocab
-        self.candidates_data = np.array(
-            [],
+        self.candidates_data = np.recarray(
+            (self.n_vocab,),
             dtype=np.dtype(
                 [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
             ),
         )
-        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
         self.candidates = llama_cpp.llama_token_data_array(
             data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=self.n_vocab,

@@ -561,14 +568,11 @@ class _LlamaTokenDataArray:
         self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)

     def copy_logits(self, logits: npt.NDArray[np.single]):
-        self.candidates_data["id"][:] = self.default_candidates_data_id
-        self.candidates_data["logit"][:] = logits
-        self.candidates_data["p"][:] = self.default_candidates_data_p
-        self.candidates.data = self.candidates_data.ctypes.data_as(
-            llama_cpp.llama_token_data_p
-        )
-        self.candidates.sorted = ctypes.c_bool(False)
-        self.candidates.size = ctypes.c_size_t(self.n_vocab)
+        self.candidates_data.id[:] = self.default_candidates_data_id
+        self.candidates_data.logit[:] = logits
+        self.candidates_data.p[:] = self.default_candidates_data_p
+        self.candidates.sorted = False
+        self.candidates.size = self.n_vocab


 # Python wrappers over common/common

@@ -759,14 +763,14 @@ class _LlamaSamplingContext:
            self.params.penalty_present,
         )
         if not self.params.penalize_nl:
-            token_data_array.candidates_data["logit"][nl_token] = nl_logit
+            token_data_array.candidates_data.logit[nl_token] = nl_logit

         if self.grammar is not None:
             ctx_main.sample_grammar(token_data_array, self.grammar)

         if self.params.temp < 0:
             ctx_main.sample_softmax(token_data_array)
-            id = token_data_array.candidates_data["id"][0]
+            id = token_data_array.candidates_data.id[0]
         elif self.params.temp == 0:
             id = ctx_main.sample_token_greedy(token_data_array)
         else:
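Note on the `_LlamaTokenDataArray` change above: allocating a `numpy.recarray` of shape `(n_vocab,)` with the aligned `(id, logit, p)` dtype gives one record per candidate token (the layout llama.cpp expects for `llama_token_data`) and enables the attribute-style field access now used in `copy_logits`. A minimal standalone sketch using only numpy; the vocabulary size below is a made-up placeholder:

```python
import numpy as np

n_vocab = 8  # placeholder; the real value comes from the loaded model
candidates_data = np.recarray(
    (n_vocab,),
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)

# Attribute access replaces the old string indexing (candidates_data["id"][:] = ...)
candidates_data.id[:] = np.arange(n_vocab, dtype=np.intc)
candidates_data.logit[:] = np.zeros(n_vocab, dtype=np.single)
candidates_data.p[:] = 0.0

# Each element is a single (id, logit, p) record, so the buffer is one contiguous
# array of per-token structs that a ctypes pointer can view directly.
print(candidates_data[0], candidates_data.ctypes.data)
```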
llama_cpp/llama.py

@@ -8,6 +8,7 @@ import json
 import ctypes
 import typing
 import fnmatch
+import warnings
 import multiprocessing

 from typing import (

@@ -71,6 +72,7 @@ class Llama:
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,

@@ -149,6 +151,7 @@ class Llama:
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.

@@ -220,6 +223,11 @@ class Llama:
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        if rpc_servers is not None:
+            self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+            self._rpc_servers = rpc_servers
+        else:
+            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
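The new `rpc_servers` argument is passed straight through to `llama_model_params.rpc_servers` as a UTF-8 string. A hedged usage sketch; the model path and host:port addresses are placeholders, and a llama.cpp build with RPC enabled (e.g. via the `build.rpc` Makefile target above) is assumed:

```python
from llama_cpp import Llama

# Hypothetical example: offload work to remote rpc-server instances.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_gpu_layers=-1,
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma separated host:port list (placeholders)
)
```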
@@ -1019,6 +1027,12 @@ class Llama:
         )
         model_name: str = model if model is not None else self.model_path

+        if prompt_tokens[:2] == [self.token_bos()] * 2:
+            warnings.warn(
+                f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...',
+                RuntimeWarning,
+            )
+
         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
         if logit_bias is not None:

@@ -1403,8 +1417,8 @@ class Llama:
             top_logprobs: List[Optional[Dict[str, float]]] = []

             if echo:
-                # Remove leading BOS token
-                all_tokens = prompt_tokens[1:] + completion_tokens
+                # Remove leading BOS token if exists
+                all_tokens = prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0:] + completion_tokens
             else:
                 all_tokens = completion_tokens
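Both changes above address the same failure mode: a prompt that already spells out the BOS token gets a second BOS prepended during tokenization. A small sketch of the condition the new `RuntimeWarning` detects; the model path is a placeholder and `_model` is an internal helper used here only for illustration:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path
bos_text = llm._model.token_get_text(llm.token_bos())  # e.g. "<s>" or "<|begin_of_text|>"

# Tokenizing text that already starts with the BOS string while add_bos=True
# yields two leading BOS ids, which is exactly the pattern the warning reports.
tokens = llm.tokenize((bos_text + " Hello").encode("utf-8"), add_bos=True, special=True)
print(tokens[:2] == [llm.token_bos()] * 2)
```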
llama_cpp/llama_chat_format.py

@@ -160,6 +160,7 @@ class ChatFormatterResponse:
     prompt: str
     stop: Optional[Union[str, List[str]]] = None
     stopping_criteria: Optional[llama.StoppingCriteriaList] = None
+    added_special: bool = False


 class ChatFormatter(Protocol):

@@ -232,7 +233,7 @@ class Jinja2ChatFormatter(ChatFormatter):
             return tokens[-1] in self.stop_token_ids
         stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token])

-        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria)
+        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria, added_special=True)

     def to_chat_handler(self) -> LlamaChatCompletionHandler:
         return chat_formatter_to_chat_completion_handler(self)

@@ -548,7 +549,7 @@ def chat_formatter_to_chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = result.prompt
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]

@@ -655,7 +656,7 @@ def hf_autotokenizer_to_chat_formatter(
         prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
-        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token, added_special=True)

     return format_autotokenizer

@@ -708,7 +709,7 @@ def hf_tokenizer_config_to_chat_formatter(
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token], added_special=True)

     return format_tokenizer_config
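Taken together, the `added_special` flag and the handler-side `llama.tokenize(..., add_bos=not result.added_special, special=True)` call let each formatter declare whether it already emitted special tokens, so BOS is added exactly once. A hedged sketch of that contract, using `format_llama3` from this module with the defaults shown in this diff:

```python
from llama_cpp import llama_chat_format

messages = [{"role": "user", "content": "Hi"}]
result = llama_chat_format.format_llama3(messages=messages)

# format_llama3 no longer bakes "<|begin_of_text|>" into the prompt, so it leaves
# added_special at its default of False and the chat handler tokenizes with add_bos=True:
#     llama.tokenize(result.prompt.encode("utf-8"),
#                    add_bos=not result.added_special, special=True)
print(result.added_special, result.prompt[:40])
```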
@@ -918,7 +919,7 @@ def format_llama2(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    _system_template = "<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>"
+    _system_template = "[INST] <<SYS>>\n{system_message}\n<</SYS>>"
     _roles = dict(user="<s>[INST]", assistant="[/INST]")
     _messages = _map_roles(messages, _roles)
     system_message = _get_system_message(messages)

@@ -940,11 +941,10 @@ def format_llama3(
         user="<|start_header_id|>user<|end_header_id|>\n\n",
         assistant="<|start_header_id|>assistant<|end_header_id|>\n\n",
     )
-    _begin_token = "<|begin_of_text|>"
     _sep = "<|eot_id|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(_begin_token, _messages, _sep)
+    _prompt = _format_no_colon_single("", _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)


@@ -1229,10 +1229,9 @@ def format_mistral_instruct(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    bos = "<s>"
     eos = "</s>"
     stop = eos
-    prompt = bos
+    prompt = ""
     for message in messages:
         if (
             message["role"] == "user"
@@ -2642,13 +2641,13 @@ class Llava15ChatHandler:
                 if type_ == "text":
                     tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                     if llama.n_tokens + len(tokens) > llama.n_ctx():
-                        raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
+                        raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}")
                     llama.eval(tokens)
                 else:
                     image_bytes = self.load_image(value)
                     embed = embed_image_bytes(image_bytes)
                     if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
-                        raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
+                        raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}")
                     n_past = ctypes.c_int(llama.n_tokens)
                     n_past_p = ctypes.pointer(n_past)
                     with suppress_stdout_stderr(disable=self.verbose):
llama_cpp/llama_cpp.py

@@ -333,7 +333,7 @@ LLAMA_ROPE_TYPE_NEOX = 2
 LLAMA_ROPE_TYPE_GLM = 4


-# enum llama_token_type {
+# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
 #     LLAMA_TOKEN_TYPE_UNDEFINED = 0,
 #     LLAMA_TOKEN_TYPE_NORMAL = 1,
 #     LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -351,6 +351,32 @@ LLAMA_TOKEN_TYPE_UNUSED = 5
 LLAMA_TOKEN_TYPE_BYTE = 6


+# enum llama_token_attr {
+#     LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+#     LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
+#     LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
+#     LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
+#     LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
+#     LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+#     LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
+#     LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
+#     LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
+#     LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
+#     LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
+# };
+LLAMA_TOKEN_ATTR_UNDEFINED = 0
+LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
+LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
+LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
+LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
+LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
+LLAMA_TOKEN_ATTR_BYTE = 1 << 5
+LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
+LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
+LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
+LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
+
+
 # // model file types
 # enum llama_ftype {
 #     LLAMA_FTYPE_ALL_F32 = 0,
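Unlike the old `LLAMA_TOKEN_TYPE_*` enumeration, the new `LLAMA_TOKEN_ATTR_*` values are bitflags, so a token may carry several attributes at once and callers test them with bitwise AND. A trivial sketch (the combined value below is hypothetical):

```python
from llama_cpp import llama_cpp

# A hypothetical attribute value combining two flags.
attr = llama_cpp.LLAMA_TOKEN_ATTR_CONTROL | llama_cpp.LLAMA_TOKEN_ATTR_RSTRIP

assert attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL      # set
assert attr & llama_cpp.LLAMA_TOKEN_ATTR_RSTRIP       # set
assert not (attr & llama_cpp.LLAMA_TOKEN_ATTR_BYTE)   # not set
```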
@@ -959,6 +985,9 @@ llama_grammar_p = ctypes.c_void_p
 # // modifies a preceding LLAMA_GRETYPE_CHAR or
 # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 # LLAMA_GRETYPE_CHAR_ALT = 6,
+
+# // any character (.)
+# LLAMA_GRETYPE_CHAR_ANY = 7,
 # };
 LLAMA_GRETYPE_END = 0
 LLAMA_GRETYPE_ALT = 1

@@ -967,6 +996,7 @@ LLAMA_GRETYPE_CHAR = 3
 LLAMA_GRETYPE_CHAR_NOT = 4
 LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
 LLAMA_GRETYPE_CHAR_ALT = 6
+LLAMA_GRETYPE_CHAR_ANY = 7


 # typedef struct llama_grammar_element {
@@ -1233,12 +1263,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 def llama_pooling_type(ctx: llama_context_p, /) -> int: ...


 # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
 @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
 def llama_vocab_type(model: llama_model_p, /) -> int: ...


 # LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
 @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
 def llama_rope_type(model: llama_model_p, /) -> int: ...
@@ -2438,11 +2468,11 @@ def llama_token_get_score(
 ) -> float: ...


-# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
 @ctypes_function(
-    "llama_token_get_type", [llama_model_p_ctypes, llama_token], ctypes.c_int
+    "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int
 )
-def llama_token_get_type(
+def llama_token_get_attr(
     model: llama_model_p, token: Union[llama_token, int], /
 ) -> int: ...
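A hedged sketch of calling the renamed binding directly against a loaded model; the model path is a placeholder and `llm._model.model` is an internal attribute, shown only to obtain the raw `llama_model` pointer for illustration:

```python
from llama_cpp import Llama, llama_cpp

llm = Llama(model_path="./models/model.gguf")  # placeholder path
model = llm._model.model                       # raw llama_model pointer (internal)

bos = llama_cpp.llama_token_bos(model)
attr = llama_cpp.llama_token_get_attr(model, bos)
if attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL:
    print(f"BOS token {bos} is flagged as a control token")
```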
@@ -3200,104 +3230,9 @@ def llama_grammar_accept_token(


 # //
-# // Beam search
+# // Model split
 # //

-# struct llama_beam_view {
-#     const llama_token * tokens;
-
-
-#     size_t n_tokens;
-#     float p; // Cumulative beam probability (renormalized relative to all beams)
-#     bool eob; // Callback should set this to true when a beam is at end-of-beam.
-# };
-class llama_beam_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        tokens: CtypesArray[llama_token]
-        n_tokens: int
-        p: float
-        eob: bool
-
-    _fields_ = [
-        ("tokens", llama_token_p),
-        ("n_tokens", ctypes.c_size_t),
-        ("p", ctypes.c_float),
-        ("eob", ctypes.c_bool),
-    ]
-
-
-# // Passed to beam_search_callback function.
-# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-# // These pointers are valid only during the synchronous callback, so should not be saved.
-# struct llama_beams_state {
-#     struct llama_beam_view * beam_views;
-#     size_t n_beams; // Number of elements in beam_views[].
-#     size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
-#     bool last_call; // True iff this is the last callback invocation.
-# };
-class llama_beams_state(ctypes.Structure):
-    if TYPE_CHECKING:
-        beam_views: CtypesArray[llama_beam_view]
-        n_beams: int
-        common_prefix_length: int
-        last_call: bool
-
-    _fields_ = [
-        ("beam_views", ctypes.POINTER(llama_beam_view)),
-        ("n_beams", ctypes.c_size_t),
-        ("common_prefix_length", ctypes.c_size_t),
-        ("last_call", ctypes.c_bool),
-    ]
-
-
-# // Type of pointer to the beam_search_callback function.
-# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(
-    None, ctypes.c_void_p, llama_beams_state
-)
-
-
-# /// @details Deterministically returns entire sentence constructed by a beam search.
-# /// @param ctx Pointer to the llama_context.
-# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-# /// @param callback_data A pointer that is simply passed back to callback.
-# /// @param n_beams Number of beams to use.
-# /// @param n_past Number of tokens already evaluated.
-# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-# /// @param n_threads Number of threads as passed to llama_eval().
-# LLAMA_API void llama_beam_search(
-#        struct llama_context * ctx,
-#        llama_beam_search_callback_fn_t callback,
-#        void * callback_data,
-#        size_t n_beams,
-#        int32_t n_past,
-#        int32_t n_predict);
-@ctypes_function(
-    "llama_beam_search",
-    [
-        llama_context_p_ctypes,
-        llama_beam_search_callback_fn_t,
-        ctypes.c_void_p,
-        ctypes.c_size_t,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    None,
-)
-def llama_beam_search(
-    ctx: llama_context_p,
-    callback: CtypesFuncPointer,
-    callback_data: ctypes.c_void_p,
-    n_beams: Union[ctypes.c_size_t, int],
-    n_past: Union[ctypes.c_int, int],
-    n_predict: Union[ctypes.c_int, int],
-    /,
-): ...
-
-
 # /// @details Build a split GGUF final path for this chunk.
 # /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
 # // Returns the split_path length.
llama_cpp/server/model.py

@@ -226,6 +226,7 @@ class LlamaProxy:
             use_mmap=settings.use_mmap,
             use_mlock=settings.use_mlock,
             kv_overrides=kv_overrides,
+            rpc_servers=settings.rpc_servers,
             # Context Params
             seed=settings.seed,
             n_ctx=settings.n_ctx,
llama_cpp/server/settings.py

@@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
     )
+    rpc_servers: Optional[str] = Field(
+        default=None,
+        description="comma seperated list of rpc servers for offloading",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
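Since `ModelSettings` is a pydantic settings class, the new field can be supplied programmatically, from a JSON config file, or from the environment like the other model settings, and `LlamaProxy` forwards it to the `Llama` constructor as shown above. A small sketch; path and addresses are placeholders:

```python
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/model.gguf",                          # placeholder path
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # placeholder addresses
)
print(settings.rpc_servers)
```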
tests/test_llama_chat_format.py

@@ -21,12 +21,13 @@ def test_mistral_instruct():
     response = llama_chat_format.format_mistral_instruct(
         messages=messages,
     )
+    prompt = ("" if response.added_special else "<s>") + response.prompt
     reference = chat_formatter.render(
         messages=messages,
         bos_token="<s>",
         eos_token="</s>",
     )
-    assert response.prompt == reference
+    assert prompt == reference


 mistral_7b_tokenizer_config = """{
vendor/llama.cpp (vendored submodule, 2 changed lines)

@@ -1 +1 @@
-Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d
+Subproject commit 5795b941827fdec6c1662986de962badff456718