From 5be8354e11e5b5cf99963eefc2c13541d60c0634 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:00:51 -0400 Subject: [PATCH] Added tokenizer --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0978e1e..82246d1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1416,8 +1416,10 @@ class LlamaTokenizer: def __init__(self, llama: Llama): self.llama = llama - def encode(self, text: str) -> List[int]: - return self.llama.tokenize(text.encode("utf-8", errors="ignore")) + def encode(self, text: str, add_bos: bool = True) -> List[int]: + return self.llama.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos + ) def decode(self, tokens: List[int]) -> str: return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")