feat: Update llama.cpp

Andrei Betlen 2024-04-21 20:46:40 -04:00
parent 893a27a736
commit 159cc4e5d9
3 changed files with 33 additions and 11 deletions

llama_cpp/_internals.py

@@ -181,20 +181,20 @@ class _LlamaModel:
                 )
         return list(tokens[:n_tokens])

-    def token_to_piece(self, token: int) -> bytes:
+    def token_to_piece(self, token: int, special: bool = False) -> bytes:
         assert self.model is not None
         buf = ctypes.create_string_buffer(32)
-        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)
+        llama_cpp.llama_token_to_piece(self.model, token, buf, 32, special)
         return bytes(buf)

-    def detokenize(self, tokens: List[int]) -> bytes:
+    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         assert self.model is not None
         output = b""
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
             n = llama_cpp.llama_token_to_piece(
-                self.model, llama_cpp.llama_token(token), buffer, size
+                self.model, llama_cpp.llama_token(token), buffer, size, special
             )
             assert n <= size
             output += bytes(buffer[:n])
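The new special flag on token_to_piece / detokenize is forwarded straight to llama_token_to_piece, so callers decide whether control tokens are rendered. A minimal sketch of how downstream code might opt in, assuming model is an already-initialized _LlamaModel (the prompt bytes are arbitrary):

# Sketch only: assumes `model` is a loaded _LlamaModel from llama_cpp._internals.
tokens = model.tokenize(b"Hello, world", add_bos=True, special=True)
plain = model.detokenize(tokens)                   # default: special tokens are not rendered
rendered = model.detokenize(tokens, special=True)  # BOS/EOS and other special tokens appear in the output bytes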
@@ -597,13 +597,13 @@ def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> li
     return list(result)


-def _token_to_piece(model: _LlamaModel, token: int) -> str:
+def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> str:
     assert model.model is not None
     result = (ctypes.c_char * 8)(0)
-    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
     if n_tokens < 0:
         result = (ctypes.c_char * -n_tokens)(0)
-        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result), special)
         if check != -n_tokens:
             raise RuntimeError(f"Failed to get piece: token={token}")
         else:
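The helper above follows llama.cpp's size-probe convention: when the 8-byte buffer is too small, llama_token_to_piece returns the negated length the piece actually needs, so the helper reallocates and calls again. A hedged standalone sketch of the same pattern against the raw binding (the import path and the small first-guess buffer mirror the code above):

import ctypes

import llama_cpp.llama_cpp as llama_cpp

# Sketch only: `model` is assumed to be a llama_model pointer (e.g. the .model
# attribute of a _LlamaModel). Probe with a small buffer, then retry with the exact size.
def token_piece(model, token: int, special: bool = False) -> bytes:
    buf = (ctypes.c_char * 8)(0)
    n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), special)
    if n < 0:  # buffer too small; -n is the required length
        buf = (ctypes.c_char * -n)(0)
        n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), special)
    return bytes(buf[:n])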

llama_cpp/llama_cpp.py

@@ -2380,6 +2380,18 @@ def llama_token_get_type(
     ...


+# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+# LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_eog(
+    model: llama_model_p, token: Union[llama_token, int], /
+) -> bool:
+    """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)"""
+    ...
+
+
 # // Special tokens
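llama_token_is_eog generalizes the usual check against llama_token_eos: it also covers EOT and any other token the model marks as end-of-generation. A hedged sketch of a stop condition in a sampling loop; model is assumed to be a loaded llama_model pointer and sample_next_token is a hypothetical stand-in for the caller's sampling step:

import llama_cpp.llama_cpp as llama_cpp

# Sketch only: collect sampled tokens until the model emits any end-of-generation
# token (EOS, EOT, ...). `sample_next_token` is hypothetical; `model` must be a
# llama_model pointer obtained from llama_cpp.llama_load_model_from_file().
def generate_until_eog(model, sample_next_token, max_tokens: int = 256) -> list:
    out = []
    for _ in range(max_tokens):
        token = sample_next_token()
        if llama_cpp.llama_token_is_eog(model, token):
            break
        out.append(token)
    return out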
@@ -2434,7 +2446,7 @@ def llama_add_eos_token(model: llama_model_p, /) -> int:
     ...


-# // codellama infill tokens
+# // Codellama infill tokens
 # LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
@@ -2524,11 +2536,13 @@ def llama_tokenize(
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.
 # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# // @param special If true, special tokens are rendered in the output.
 # LLAMA_API int32_t llama_token_to_piece(
 #           const struct llama_model * model,
 #                        llama_token   token,
 #                               char * buf,
-#                              int32_t length);
+#                              int32_t length,
+#                                 bool  special);
 @ctypes_function(
     "llama_token_to_piece",
     [
@@ -2536,6 +2550,7 @@ def llama_tokenize(
         llama_token,
         ctypes.c_char_p,
         ctypes.c_int32,
+        ctypes.c_bool,
     ],
     ctypes.c_int32,
 )
@@ -2544,13 +2559,20 @@ def llama_token_to_piece(
     token: Union[llama_token, int],
     buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]],
     length: Union[ctypes.c_int, int],
+    special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
     """Token Id -> Piece.
     Uses the vocabulary in the provided context.
     Does not write null terminator to the buffer.
     User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    """
+
+    Args:
+        model: The model to use for tokenization.
+        token: The token to convert.
+        buf: The buffer to write the token to.
+        length: The length of the buffer.
+        special: If true, special tokens are rendered in the output."""
     ...
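Note that special is a new required positional argument of the binding, so code calling llama_token_to_piece directly has to be updated. A minimal sketch of the updated call, assuming model is a loaded llama_model pointer; the 32-byte buffer mirrors the wrappers above:

import ctypes

import llama_cpp.llama_cpp as llama_cpp

# Sketch only: direct call with the new 5th argument. With special=True a control
# token such as EOS is rendered (e.g. "</s>" for a Llama-2-style vocab); with
# special=False it produces no output.
def render(model, token: int, special: bool) -> bytes:
    buf = ctypes.create_string_buffer(32)
    n = llama_cpp.llama_token_to_piece(model, token, buf, len(buf), special)
    return bytes(buf[:n]) if n >= 0 else b""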

vendor/llama.cpp

@@ -1 +1 @@
-Subproject commit 3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
+Subproject commit 5cf5e7d490dfdd2e70bface2d35dfd14aa44b4fb