feat: Update llama.cpp
parent 893a27a736
commit 159cc4e5d9
3 changed files with 33 additions and 11 deletions

llama_cpp/_internals.py

@@ -181,20 +181,20 @@ class _LlamaModel:
         )
         return list(tokens[:n_tokens])
 
-    def token_to_piece(self, token: int) -> bytes:
+    def token_to_piece(self, token: int, special: bool = False) -> bytes:
         assert self.model is not None
         buf = ctypes.create_string_buffer(32)
-        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)
+        llama_cpp.llama_token_to_piece(self.model, token, buf, 32, special)
         return bytes(buf)
 
-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         assert self.model is not None
         output = b""
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
             n = llama_cpp.llama_token_to_piece(
-                self.model, llama_cpp.llama_token(token), buffer, size
+                self.model, llama_cpp.llama_token(token), buffer, size, special
             )
             assert n <= size
             output += bytes(buffer[:n])
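
The new `special` flag is passed straight through to the C API: when true, llama.cpp renders special tokens (EOS, EOT, chat-control tokens such as `<|im_end|>`) as their literal text instead of suppressing them. A minimal sketch of the difference, assuming an already-constructed `_LlamaModel` instance named `model` (the setup and the exact EOS text are hypothetical, not part of this commit):

    # Tokenize some text and append the model's EOS token.
    tokens = model.tokenize(b"Hello", add_bos=False, special=False)
    tokens.append(model.token_eos())

    out_plain = model.detokenize(tokens)                  # EOS text suppressed
    out_special = model.detokenize(tokens, special=True)  # EOS rendered, e.g. "</s>"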

@@ -597,13 +597,13 @@ def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> li
     return list(result)
 
 
-def _token_to_piece(model: _LlamaModel, token: int) -> str:
+def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> str:
     assert model.model is not None
     result = (ctypes.c_char * 8)(0)
-    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
     if n_tokens < 0:
         result = (ctypes.c_char * -n_tokens)(0)
-        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
         if check != -n_tokens:
             raise RuntimeError(f"Failed to get piece: token={token}")
     else:
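
The helper above leans on a return-value convention of llama_token_to_piece that is easy to miss: on success it returns the number of bytes written, and when the buffer is too small it returns the negative of the required size, so the caller retries once with exactly that capacity. A self-contained restatement of the retry logic (a sketch, not an exported API; `model` here is a raw llama_model_p):

    import ctypes
    import llama_cpp

    def token_to_piece_str(model, token: int, special: bool = False) -> str:
        buf = (ctypes.c_char * 8)(0)
        n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), special)
        if n < 0:
            # Negative return: retry with the size the library asked for.
            buf = (ctypes.c_char * -n)(0)
            n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), special)
        return bytes(buf[:n]).decode("utf-8", errors="replace")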

llama_cpp/llama_cpp.py

@@ -2380,6 +2380,18 @@ def llama_token_get_type(
     ...
 
 
+# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+# LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_eog(
+    model: llama_model_p, token: Union[llama_token, int], /
+) -> bool:
+    """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)"""
+    ...
+
+
 # // Special tokens
 
 
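llama_token_is_eog generalizes the old `token == llama_token_eos(model)` check: a model may define several end-of-generation tokens (EOS, EOT, etc.), and this predicate covers all of them. A hedged sketch of how a low-level decode loop might use it (`model`, `ctx`, `max_new_tokens`, and the `sample_next_token` helper are assumptions, not part of this commit):

    generated: list[int] = []
    while len(generated) < max_new_tokens:
        token = sample_next_token(ctx)  # hypothetical sampling helper
        if llama_cpp.llama_token_is_eog(model, token):
            break  # stop on EOS, EOT, or any other end-of-generation token
        generated.append(token)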

@@ -2434,7 +2446,7 @@ def llama_add_eos_token(model: llama_model_p, /) -> int:
     ...
 
 
-# // codellama infill tokens
+# // Codellama infill tokens
 # LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:

@@ -2524,11 +2536,13 @@ def llama_tokenize(
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
 # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 # const struct llama_model * model,
 # llama_token token,
 # char * buf,
-# int32_t length);
+# int32_t length,
+# bool special);
 @ctypes_function(
     "llama_token_to_piece",
     [

@@ -2536,6 +2550,7 @@ def llama_tokenize(
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_bool,
     ],
     ctypes.c_int32,
 )

@@ -2544,13 +2559,20 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
     User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    """
+
+    Args:
+        model: The model to use for tokenization.
+        token: The token to convert.
+        buf: The buffer to write the token to.
+        length: The length of the buffer.
+        special: If true, special tokens are rendered in the output."""
     ...
 
 
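Because ctypes.c_bool is now part of the binding's argtypes, existing callers that pass only four arguments will fail at call time with an argument-count error until they add the flag. A direct low-level call now looks roughly like this (a sketch: the model path is a placeholder, and the zero-argument llama_backend_init form is assumed from this version of the bindings):

    import ctypes
    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", params)

    buf = ctypes.create_string_buffer(32)
    token = llama_cpp.llama_token_eos(model)
    # The trailing argument is the new `special` flag.
    n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), True)
    piece = buf.raw[:n] if n >= 0 else b""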

vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit 3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
+Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb