Commit 901827013b introduced a cyclic dependency within Llama objects. The cycle keeps old models alive longer than necessary, creating memory bloat in applications that switch between models at runtime: once a reference cycle exists, the model is only reclaimed when the garbage collector's cycle detector runs, not when its reference count drops to zero. This patch simply removes the problematic line, allowing models to be deallocated promptly without relying on the GC. If the `llama` attribute absolutely must be exposed on the tokenizer class, combining `weakref.ref` with a `@property` would avoid reintroducing the cycle.
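
Should that ever be needed, a minimal sketch of the `weakref.ref` + `@property` idea could look like the following. This is illustrative only and not part of this patch; the tokenizer's abstract methods are omitted and only the reference-handling code is shown:

```python
import weakref
from typing import Optional

import llama_cpp


class LlamaTokenizer:
    def __init__(self, llama: llama_cpp.Llama):
        self._model = llama._model  # strong reference to the low-level model only
        # Hold the owning Llama weakly so the tokenizer never keeps it alive.
        self._llama_ref = weakref.ref(llama)

    @property
    def llama(self) -> Optional[llama_cpp.Llama]:
        # Dereferencing the weakref returns None once the Llama has been freed.
        return self._llama_ref()
```

With this pattern the tokenizer can still reach its parent while the parent exists, but no reference cycle is formed, so the parent is freed by reference counting as soon as the caller drops it.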
from __future__ import annotations

import abc
from typing import (
    List,
    Optional,
    Any,
)

import llama_cpp
from llama_cpp.llama_types import List


class BaseLlamaTokenizer(abc.ABC):
    @abc.abstractmethod
    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        raise NotImplementedError


class LlamaTokenizer(BaseLlamaTokenizer):
    def __init__(self, llama: llama_cpp.Llama):
        self._model = llama._model  # type: ignore

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        if prev_tokens is not None:
            return self._model.detokenize(tokens[len(prev_tokens) :])
        else:
            return self._model.detokenize(tokens)

    def encode(
        self, text: str, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        return self.tokenize(
            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )

    def decode(self, tokens: List[int]) -> str:
        return self.detokenize(tokens).decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))


class LlamaHFTokenizer(BaseLlamaTokenizer):
    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        return self.hf_tokenizer.encode(
            text.decode("utf-8", errors="ignore"), add_special_tokens=special
        )

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        if prev_tokens is not None:
            text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
            prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
                "utf-8", errors="ignore"
            )
            return text[len(prev_text) :]
        else:
            return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library is required to use the `HFTokenizer`."
                "You can install it with `pip install transformers`."
            )
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)
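
For context, a short usage sketch of the two tokenizer classes above. The import path assumes the file lives at `llama_cpp/llama_tokenizer.py`; the model path and Hugging Face repo id are placeholders:

```python
from llama_cpp.llama_tokenizer import LlamaHFTokenizer, LlamaTokenizer

# GGUF-backed tokenizer; loads only the vocabulary (model path is a placeholder).
gguf_tok = LlamaTokenizer.from_ggml_file("models/llama-2-7b.Q4_K_M.gguf")
ids = gguf_tok.encode("Hello, world!")
print(gguf_tok.decode(ids))

# Hugging Face tokenizer wrapper; requires `transformers` (repo id is a placeholder).
hf_tok = LlamaHFTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
ids = hf_tok.tokenize(b"Hello, world!")
print(hf_tok.detokenize(ids).decode("utf-8"))
```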