Add LlamaTokenizer class

This commit is contained in:
Andrei Betlen 2023-05-25 14:11:33 -04:00
parent 1d247e0f35
commit ca01f98e09

View file

@ -1380,6 +1380,11 @@ class Llama:
assert self.ctx is not None assert self.ctx is not None
return llama_cpp.llama_n_vocab(self.ctx) return llama_cpp.llama_n_vocab(self.ctx)
def tokenizer(self) -> "LlamaTokenizer":
    """Build a LlamaTokenizer wrapping this model instance.

    Returns:
        A new LlamaTokenizer constructed with ``self`` as its model.

    Raises:
        AssertionError: if ``self.ctx`` is None.
    """
    # Same guard the surrounding methods use before touching the context.
    assert self.ctx is not None
    wrapper = LlamaTokenizer(self)
    return wrapper
@staticmethod @staticmethod
def token_eos() -> int: def token_eos() -> int:
"""Return the end-of-sequence token.""" """Return the end-of-sequence token."""
@ -1410,3 +1415,18 @@ class Llama:
else: else:
break break
return longest_prefix return longest_prefix
class LlamaTokenizer:
    """String-level wrapper over a model's ``tokenize`` / ``detokenize`` calls.

    The wrapped model works on UTF-8 bytes; this class converts to and from
    ``str`` on either side of those calls.
    """

    def __init__(self, llama: Llama):
        """Store the model object whose tokenizer methods will be used."""
        self.llama = llama

    def encode(self, text: str) -> List[int]:
        """Turn ``text`` into token ids.

        The text is UTF-8 encoded with ``errors="ignore"``, so anything that
        cannot be encoded is dropped, and the bytes are passed to
        ``self.llama.tokenize``.
        """
        payload = text.encode("utf-8", errors="ignore")
        return self.llama.tokenize(payload)

    def decode(self, tokens: List[int]) -> str:
        """Turn token ids back into text.

        The bytes returned by ``self.llama.detokenize`` are decoded as UTF-8
        with ``errors="ignore"``, so invalid byte sequences are dropped rather
        than raising.
        """
        payload = self.llama.detokenize(tokens)
        return payload.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Create a tokenizer from a model file on disk.

        Constructs ``Llama(model_path=path, vocab_only=True)`` and wraps it.
        NOTE(review): presumably ``vocab_only`` skips loading the weights;
        confirm against the Llama constructor.
        """
        model = Llama(model_path=path, vocab_only=True)
        return cls(model)