Add LlamaTokenizer class
This commit is contained in:
parent
1d247e0f35
commit
ca01f98e09
1 changed file with 20 additions and 0 deletions
|
@@ -1380,6 +1380,11 @@ class Llama:
|
||||||
assert self.ctx is not None
|
assert self.ctx is not None
|
||||||
return llama_cpp.llama_n_vocab(self.ctx)
|
return llama_cpp.llama_n_vocab(self.ctx)
|
||||||
|
|
||||||
|
def tokenizer(self) -> "LlamaTokenizer":
|
||||||
|
"""Return the tokenizer for this model."""
|
||||||
|
assert self.ctx is not None
|
||||||
|
return LlamaTokenizer(self)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def token_eos() -> int:
|
def token_eos() -> int:
|
||||||
"""Return the end-of-sequence token."""
|
"""Return the end-of-sequence token."""
|
||||||
|
@@ -1410,3 +1415,18 @@ class Llama:
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return longest_prefix
|
return longest_prefix
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaTokenizer:
|
||||||
|
def __init__(self, llama: Llama):
|
||||||
|
self.llama = llama
|
||||||
|
|
||||||
|
def encode(self, text: str) -> List[int]:
|
||||||
|
return self.llama.tokenize(text.encode("utf-8", errors="ignore"))
|
||||||
|
|
||||||
|
def decode(self, tokens: List[int]) -> str:
|
||||||
|
return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
|
||||||
|
return cls(Llama(model_path=path, vocab_only=True))
|
||||||
|
|
Loading…
Reference in a new issue