diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index e4be9d1..c6b55ab 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -3,7 +3,6 @@ import sys
 import uuid
 import time
 import multiprocessing
-from abc import ABC, abstractmethod
 from typing import (
     List,
     Optional,
@@ -12,16 +11,20 @@ from typing import (
     Sequence,
     Iterator,
     Deque,
-    Tuple,
     Callable,
 )
-from collections import deque, OrderedDict
+from collections import deque
 
-import diskcache
 import ctypes
 
 from .llama_types import *
 from .llama_grammar import LlamaGrammar
+from .llama_cache import (
+    BaseLlamaCache,
+    LlamaCache,  # type: ignore
+    LlamaDiskCache,  # type: ignore
+    LlamaRAMCache,  # type: ignore
+)
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
@@ -31,142 +34,6 @@ import numpy.typing as npt
 from ._utils import suppress_stdout_stderr
 
 
-class BaseLlamaCache(ABC):
-    """Base cache class for a llama.cpp model."""
-
-    def __init__(self, capacity_bytes: int = (2 << 30)):
-        self.capacity_bytes = capacity_bytes
-
-    @property
-    @abstractmethod
-    def cache_size(self) -> int:
-        raise NotImplementedError
-
-    def _find_longest_prefix_key(
-        self,
-        key: Tuple[int, ...],
-    ) -> Optional[Tuple[int, ...]]:
-        pass
-
-    @abstractmethod
-    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
-        raise NotImplementedError
-
-    @abstractmethod
-    def __contains__(self, key: Sequence[int]) -> bool:
-        raise NotImplementedError
-
-    @abstractmethod
-    def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None:
-        raise NotImplementedError
-
-
-class LlamaRAMCache(BaseLlamaCache):
-    """Cache for a llama.cpp model using RAM."""
-
-    def __init__(self, capacity_bytes: int = (2 << 30)):
-        super().__init__(capacity_bytes)
-        self.capacity_bytes = capacity_bytes
-        self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict()
-
-    @property
-    def cache_size(self):
-        return sum([state.llama_state_size for state in self.cache_state.values()])
-
-    def _find_longest_prefix_key(
-        self,
-        key: Tuple[int, ...],
-    ) -> Optional[Tuple[int, ...]]:
-        min_len = 0
-        min_key = None
-        keys = (
-            (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys()
-        )
-        for k, prefix_len in keys:
-            if prefix_len > min_len:
-                min_len = prefix_len
-                min_key = k
-        return min_key
-
-    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
-        key = tuple(key)
-        _key = self._find_longest_prefix_key(key)
-        if _key is None:
-            raise KeyError("Key not found")
-        value = self.cache_state[_key]
-        self.cache_state.move_to_end(_key)
-        return value
-
-    def __contains__(self, key: Sequence[int]) -> bool:
-        return self._find_longest_prefix_key(tuple(key)) is not None
-
-    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
-        key = tuple(key)
-        if key in self.cache_state:
-            del self.cache_state[key]
-        self.cache_state[key] = value
-        while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0:
-            self.cache_state.popitem(last=False)
-
-
-# Alias for backwards compatibility
-LlamaCache = LlamaRAMCache
-
-
-class LlamaDiskCache(BaseLlamaCache):
-    """Cache for a llama.cpp model using disk."""
-
-    def __init__(
-        self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
-    ):
-        super().__init__(capacity_bytes)
-        self.cache = diskcache.Cache(cache_dir)
-
-    @property
-    def cache_size(self):
-        return int(self.cache.volume())  # type: ignore
-
-    def _find_longest_prefix_key(
-        self,
-        key: Tuple[int, ...],
-    ) -> Optional[Tuple[int, ...]]:
-        min_len = 0
-        min_key: Optional[Tuple[int, ...]] = None
-        for k in self.cache.iterkeys():  # type: ignore
-            prefix_len = Llama.longest_token_prefix(k, key)
-            if prefix_len > min_len:
-                min_len = prefix_len
-                min_key = k  # type: ignore
-        return min_key
-
-    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
-        key = tuple(key)
-        _key = self._find_longest_prefix_key(key)
-        if _key is None:
-            raise KeyError("Key not found")
-        value: "LlamaState" = self.cache.pop(_key)  # type: ignore
-        # NOTE: This puts an integer as key in cache, which breaks,
-        # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
-        # self.cache.push(_key, side="front")  # type: ignore
-        return value
-
-    def __contains__(self, key: Sequence[int]) -> bool:
-        return self._find_longest_prefix_key(tuple(key)) is not None
-
-    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
-        print("LlamaDiskCache.__setitem__: called", file=sys.stderr)
-        key = tuple(key)
-        if key in self.cache:
-            print("LlamaDiskCache.__setitem__: delete", file=sys.stderr)
-            del self.cache[key]
-        self.cache[key] = value
-        print("LlamaDiskCache.__setitem__: set", file=sys.stderr)
-        while self.cache_size > self.capacity_bytes and len(self.cache) > 0:
-            key_to_remove = next(iter(self.cache))
-            del self.cache[key_to_remove]
-            print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
-
-
 class LlamaState:
     def __init__(
         self,
diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py
new file mode 100644
index 0000000..9e9870a
--- /dev/null
+++ b/llama_cpp/llama_cache.py
@@ -0,0 +1,150 @@
+import sys
+from abc import ABC, abstractmethod
+from typing import (
+    Optional,
+    Sequence,
+    Tuple,
+)
+from collections import OrderedDict
+
+import diskcache
+
+import llama_cpp.llama
+
+from .llama_types import *
+
+
+class BaseLlamaCache(ABC):
+    """Base cache class for a llama.cpp model."""
+
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        self.capacity_bytes = capacity_bytes
+
+    @property
+    @abstractmethod
+    def cache_size(self) -> int:
+        raise NotImplementedError
+
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        pass
+
+    @abstractmethod
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        raise NotImplementedError
+
+    @abstractmethod
+    def __contains__(self, key: Sequence[int]) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState") -> None:
+        raise NotImplementedError
+
+
+class LlamaRAMCache(BaseLlamaCache):
+    """Cache for a llama.cpp model using RAM."""
+
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        super().__init__(capacity_bytes)
+        self.capacity_bytes = capacity_bytes
+        self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = OrderedDict()
+
+    @property
+    def cache_size(self):
+        return sum([state.llama_state_size for state in self.cache_state.values()])
+
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        min_len = 0
+        min_key = None
+        keys = (
+            (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys()
+        )
+        for k, prefix_len in keys:
+            if prefix_len > min_len:
+                min_len = prefix_len
+                min_key = k
+        return min_key
+
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        key = tuple(key)
+        _key = self._find_longest_prefix_key(key)
+        if _key is None:
+            raise KeyError("Key not found")
+        value = self.cache_state[_key]
+        self.cache_state.move_to_end(_key)
+        return value
+
+    def __contains__(self, key: Sequence[int]) -> bool:
+        return self._find_longest_prefix_key(tuple(key)) is not None
+
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
+        key = tuple(key)
+        if key in self.cache_state:
+            del self.cache_state[key]
+        self.cache_state[key] = value
+        while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0:
+            self.cache_state.popitem(last=False)
+
+
+# Alias for backwards compatibility
+LlamaCache = LlamaRAMCache
+
+
+class LlamaDiskCache(BaseLlamaCache):
+    """Cache for a llama.cpp model using disk."""
+
+    def __init__(
+        self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
+    ):
+        super().__init__(capacity_bytes)
+        self.cache = diskcache.Cache(cache_dir)
+
+    @property
+    def cache_size(self):
+        return int(self.cache.volume())  # type: ignore
+
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        min_len = 0
+        min_key: Optional[Tuple[int, ...]] = None
+        for k in self.cache.iterkeys():  # type: ignore
+            prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key)
+            if prefix_len > min_len:
+                min_len = prefix_len
+                min_key = k  # type: ignore
+        return min_key
+
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        key = tuple(key)
+        _key = self._find_longest_prefix_key(key)
+        if _key is None:
+            raise KeyError("Key not found")
+        value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key)  # type: ignore
+        # NOTE: This puts an integer as key in cache, which breaks,
+        # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
+        # self.cache.push(_key, side="front")  # type: ignore
+        return value
+
+    def __contains__(self, key: Sequence[int]) -> bool:
+        return self._find_longest_prefix_key(tuple(key)) is not None
+
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
+        print("LlamaDiskCache.__setitem__: called", file=sys.stderr)
+        key = tuple(key)
+        if key in self.cache:
+            print("LlamaDiskCache.__setitem__: delete", file=sys.stderr)
+            del self.cache[key]
+        self.cache[key] = value
+        print("LlamaDiskCache.__setitem__: set", file=sys.stderr)
+        while self.cache_size > self.capacity_bytes and len(self.cache) > 0:
+            key_to_remove = next(iter(self.cache))
+            del self.cache[key_to_remove]
+            print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
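
The cache classes keep their behavior after the move; only their import location changes (llama_cpp.llama re-exports them, so existing imports keep working). Below is a minimal usage sketch of the relocated classes with the existing Llama.set_cache API, assuming this diff is applied; the model path is a placeholder, not part of this change.

    # Sketch: wire the relocated disk cache into a Llama instance.
    from llama_cpp import Llama
    from llama_cpp.llama_cache import LlamaDiskCache  # new module introduced by this diff

    llm = Llama(model_path="./models/model.gguf")  # placeholder path

    # Persist evaluated prompt state to disk so completions sharing a token
    # prefix can be resumed instead of re-evaluated (capacity defaults to 2 GiB).
    llm.set_cache(LlamaDiskCache(cache_dir=".cache/llama_cache"))

    out = llm("Q: What is the capital of France? A:", max_tokens=16)
    print(out["choices"][0]["text"])

LlamaRAMCache (aliased as LlamaCache for backwards compatibility) drops in the same way for an in-memory cache.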