diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 79f6543..ff2d657 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -181,20 +181,20 @@ class _LlamaModel:
         )
         return list(tokens[:n_tokens])

-    def token_to_piece(self, token: int) -> bytes:
+    def token_to_piece(self, token: int, special: bool = False) -> bytes:
         assert self.model is not None
         buf = ctypes.create_string_buffer(32)
-        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)
+        llama_cpp.llama_token_to_piece(self.model, token, buf, 32, special)
         return bytes(buf)

-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         assert self.model is not None
         output = b""
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
             n = llama_cpp.llama_token_to_piece(
-                self.model, llama_cpp.llama_token(token), buffer, size
+                self.model, llama_cpp.llama_token(token), buffer, size, special
             )
             assert n <= size
             output += bytes(buffer[:n])
@@ -597,13 +597,13 @@ def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> li
     return list(result)


-def _token_to_piece(model: _LlamaModel, token: int) -> str:
+def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> str:
     assert model.model is not None
     result = (ctypes.c_char * 8)(0)
-    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
     if n_tokens < 0:
         result = (ctypes.c_char * -n_tokens)(0)
-        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
         if check != -n_tokens:
             raise RuntimeError(f"Failed to get piece: token={token}")
     else:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 2450d11..c2b909e 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2380,6 +2380,18 @@ def llama_token_get_type(
     ...


+# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+# LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_eog(
+    model: llama_model_p, token: Union[llama_token, int], /
+) -> bool:
+    """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)"""
+    ...
+
+
 # // Special tokens


@@ -2434,7 +2446,7 @@ def llama_add_eos_token(model: llama_model_p, /) -> int:
     ...


-# // codellama infill tokens
+# // Codellama infill tokens
 # LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
@@ -2524,11 +2536,13 @@ def llama_tokenize(
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
 # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 #           const struct llama_model * model,
 #                        llama_token   token,
 #                               char * buf,
-#                              int32_t length);
+#                              int32_t length,
+#                                 bool special);
 @ctypes_function(
     "llama_token_to_piece",
     [
@@ -2536,6 +2550,7 @@ def llama_tokenize(
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_bool,
     ],
     ctypes.c_int32,
 )
@@ -2544,13 +2559,20 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
     User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    """
+
+    Args:
+        model: The model to use for tokenization.
+        token: The token to convert.
+        buf: The buffer to write the token to.
+        length: The length of the buffer.
+        special: If true, special tokens are rendered in the output."""
     ...


diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 3b8f1ec..5cf5e7d 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
+Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb
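
For reviewers, a minimal usage sketch of the two additions in this patch: the new `special` argument to `llama_token_to_piece` and the new `llama_token_is_eog` binding. This is not part of the patch; the model path `./model.gguf` is a hypothetical placeholder, and the surrounding calls (`llama_backend_init`, `llama_load_model_from_file`, `llama_token_eos`, `llama_free_model`, `llama_backend_free`) are the pre-existing low-level bindings in `llama_cpp/llama_cpp.py`.

    import ctypes

    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_model_default_params()
    # Hypothetical placeholder path; any local GGUF model works.
    model = llama_cpp.llama_load_model_from_file(b"./model.gguf", params)
    assert model is not None

    eos = llama_cpp.llama_token_eos(model)
    buf = ctypes.create_string_buffer(32)

    # special=False keeps the old behavior: control tokens are not rendered.
    n = llama_cpp.llama_token_to_piece(model, eos, buf, 32, False)
    print(bytes(buf[:n]))  # b""

    # special=True renders the token's text, e.g. b"</s>" for a Llama-2 vocab.
    n = llama_cpp.llama_token_to_piece(model, eos, buf, 32, True)
    print(bytes(buf[:n]))

    # llama_token_is_eog generalizes a bare `token == llama_token_eos(model)`
    # comparison to every end-of-generation token (EOS, EOT, etc.).
    print(llama_cpp.llama_token_is_eog(model, eos))  # True

    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()

The `special` flag matters most for detokenizing chat-model output, where end-of-turn markers such as `<|eot_id|>` are control tokens that would otherwise disappear from the decoded text.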