Added both LlamaCache classes: Disk and RAM.

2023-05-31 22:33:56 +02:00 · 2023-05-31 22:33:56 +02:00 · 29f9c9cca3
commit 29f9c9cca3
parent 9ea7a379d3
1 changed files with 263 additions and 193 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -4,6 +4,7 @@ import uuid
 import time
 import math
 import multiprocessing
+from abc import ABC
 from typing import (
    List,
    Optional,
@ -26,13 +27,86 @@ import numpy as np
 import numpy.typing as npt


+class LlamaCache(ABC):
+    """Base cache class for a llama.cpp model.
+
+    Stores the configured capacity and provides a prefix-based membership
+    test. The lookup and storage methods are placeholders that concrete
+    caches are expected to override.
+    """

-class LlamaCache:
-    """Cache for a llama.cpp model."""
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        """Record the cache capacity.
+
+        Args:
+            capacity_bytes: Upper bound, in bytes, that a concrete cache
+                should keep its stored states under.
+        """
+        # Kept on the base class so a subclass that only calls
+        # super().__init__(capacity_bytes) still has the attribute available.
+        self.capacity_bytes = capacity_bytes
+
+    @property
+    def cache_size(self):
+        """Return the number of bytes currently cached (always 0 here)."""
+        return 0
+
+    def _find_longest_prefix_key(
+            self,
+            key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        """Return the stored key sharing the longest prefix with ``key``.
+
+        The base class stores nothing, so it always returns ``None``.
+        """
+        return None
+
+    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
+        """Placeholder lookup; the base class holds no states and returns nothing."""
+        pass
+
+    def __contains__(self, key: Sequence[int]) -> bool:
+        """Return True when some stored key shares a prefix with ``key``.
+
+        Delegates to ``_find_longest_prefix_key`` so that a subclass which
+        overrides only the prefix search gets a working ``in`` test.
+        """
+        return self._find_longest_prefix_key(tuple(key)) is not None
+
+    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
+        """Placeholder store; the base class discards ``value``."""
+        pass
+
+
+class LlamaRAMCache(LlamaCache):
+    """In-memory llama.cpp state cache keyed by token sequences.
+
+    States are held in an ordered mapping. A successful lookup moves the
+    matched entry to the end, and inserts evict entries from the front while
+    the summed state size exceeds ``capacity_bytes``.
+    """
+
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        """Create an empty cache limited to ``capacity_bytes`` bytes."""
+        super().__init__(capacity_bytes)
+        self.capacity_bytes = capacity_bytes
+        self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict()
+
+    @property
+    def cache_size(self):
+        """Return the sum of ``llama_state_size`` over all stored states."""
+        return sum(entry.llama_state_size for entry in self.cache_state.values())
+
+    def _find_longest_prefix_key(
+            self,
+            key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        """Return the stored key with the longest token prefix shared with ``key``.
+
+        Returns ``None`` when no stored key shares a prefix of length > 0.
+        On ties the key encountered first in iteration order wins.
+        """
+        best_key = None
+        best_len = 0
+        for candidate in self.cache_state.keys():
+            shared_len = Llama.longest_token_prefix(candidate, key)
+            # Strict comparison keeps the earliest candidate on equal lengths.
+            if shared_len > best_len:
+                best_key, best_len = candidate, shared_len
+        return best_key
+
+    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
+        """Return the state stored under the best prefix match for ``key``.
+
+        Raises:
+            KeyError: If no stored key shares a prefix with ``key``.
+        """
+        matched = self._find_longest_prefix_key(tuple(key))
+        if matched is None:
+            raise KeyError("Key not found")
+        # Mark the entry as most recently used before handing it back.
+        self.cache_state.move_to_end(matched)
+        return self.cache_state[matched]
+
+    def __contains__(self, key: Sequence[int]) -> bool:
+        """Return True when some stored key shares a prefix with ``key``."""
+        matched = self._find_longest_prefix_key(tuple(key))
+        return matched is not None
+
+    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
+        """Store ``value`` under ``key`` and evict from the front until within capacity."""
+        token_key = tuple(key)
+        # Remove any existing entry first so the re-inserted key lands at the end.
+        self.cache_state.pop(token_key, None)
+        self.cache_state[token_key] = value
+        while self.capacity_bytes < self.cache_size:
+            self.cache_state.popitem(last=False)
+
+
+class LlamaDiskCache(LlamaCache):
+    """Cache for a llama.cpp model using disk."""

    def __init__(self, cache_dir="./llama_cache", capacity_bytes: int = (2 << 30)):
+        super().__init__(capacity_bytes)
        self.cache = diskcache.Cache(cache_dir)
-        self.capacity_bytes = capacity_bytes

    @property
    def cache_size(self):
@ -60,9 +134,6 @@ class LlamaCache:
        self.cache.push(_key)
        return value

-    def __contains__(self, key: Sequence[int]) -> bool:
-        return self._find_longest_prefix_key(tuple(key)) is not None
-
    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
        key = tuple(key)
        if key in self.cache:
@ -73,7 +144,6 @@ class LlamaCache:
            del self.cache[key_to_remove]


-
 class LlamaState:
    def __init__(
            self,