From 19ba9d3845d4e4bbfdec3fd2dd346bc0c6d3653c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 18 Jul 2023 19:27:41 -0400
Subject: [PATCH] Use numpy arrays for logits_processors and stopping_criteria.
 Closes #491

---
 llama_cpp/llama.py      | 31 ++++++++++++++++++-------------
 llama_cpp/server/app.py |  9 ++++++---
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 92ca67d..b23280d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -27,6 +27,7 @@ from .llama_types import *
 import numpy as np
 import numpy.typing as npt
 
+
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -179,21 +180,27 @@ class LlamaState:
         self.llama_state_size = llama_state_size
 
 
-LogitsProcessor = Callable[[List[int], List[float]], List[float]]
+LogitsProcessor = Callable[
+    [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single]
+]
 
 
 class LogitsProcessorList(List[LogitsProcessor]):
-    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
         for processor in self:
             scores = processor(input_ids, scores)
         return scores
 
 
-StoppingCriteria = Callable[[List[int], List[float]], bool]
+StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool]
 
 
 class StoppingCriteriaList(List[StoppingCriteria]):
-    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
+    ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
 
 
@@ -274,9 +281,11 @@ class Llama:
 
         self._c_tensor_split = None
         if self.tensor_split is not None:
-            #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
-            self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd
+            self._c_tensor_split = FloatArray(
+                *tensor_split
+            )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split
 
         self.params.rope_freq_base = rope_freq_base
@@ -503,11 +512,7 @@ class Llama:
         logits: npt.NDArray[np.single] = self._scores[-1, :]
 
         if logits_processor is not None:
-            logits = np.array(
-                logits_processor(self._input_ids.tolist(), logits.tolist()),
-                dtype=np.single,
-            )
-            self._scores[-1, :] = logits
+            logits[:] = logits_processor(self._input_ids, logits)
 
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
@@ -725,7 +730,7 @@
                 logits_processor=logits_processor,
             )
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
             ):
                 return
             tokens_or_none = yield token
@@ -1014,7 +1019,7 @@
                     break
 
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
             ):
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 7c241fb..8a9b818 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -16,6 +16,9 @@ from pydantic import BaseModel, Field
 from pydantic_settings import BaseSettings
 from sse_starlette.sse import EventSourceResponse
 
+import numpy as np
+import numpy.typing as npt
+
 
 class Settings(BaseSettings):
     model: str = Field(
@@ -336,9 +339,9 @@ def make_logit_bias_processor(
             to_bias[input_id] = score
 
     def logit_bias_processor(
-        input_ids: List[int],
-        scores: List[float],
-    ) -> List[float]:
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
+    ) -> npt.NDArray[np.single]:
         new_scores = [None] * len(scores)
         for input_id, score in enumerate(scores):
             new_scores[input_id] = score + to_bias.get(input_id, 0.0)
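
The hunks above change the LogitsProcessor and StoppingCriteria callbacks to take numpy arrays (np.intc token ids, np.single scores) instead of Python lists, and the sampling path now writes the processor's result back with logits[:] = logits_processor(...), so a processor must fill or return an array of the same length. The sketch below is illustrative only, not part of the patch: it assumes the llama_cpp.Llama, llama_cpp.LogitsProcessorList, and llama_cpp.StoppingCriteriaList names referenced in the diff, and the model path and token ids are placeholders.

# Caller-side sketch (not part of the patch); the model path and token ids are
# placeholders chosen only to illustrate the new numpy-based signatures.
import numpy as np
import numpy.typing as npt

import llama_cpp


def suppress_token(
    input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
) -> npt.NDArray[np.single]:
    # scores is the logits row for the next token; mask a hypothetical token id
    # with numpy indexing instead of rebuilding a Python list.
    scores[123] = -np.inf  # placeholder token id
    return scores


def stop_after_token(
    input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
) -> bool:
    # input_ids is now an array of generated token ids rather than a list.
    return input_ids.size > 0 and int(input_ids[-1]) == 2  # placeholder stop id


llm = llama_cpp.Llama(model_path="./models/7B/ggml-model.bin")  # placeholder path
output = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=32,
    logits_processor=llama_cpp.LogitsProcessorList([suppress_token]),
    stopping_criteria=llama_cpp.StoppingCriteriaList([stop_after_token]),
)
print(output["choices"][0]["text"])

Because the result is assigned back into the scores row, a processor may either mutate the array in place or return a fresh array of the same shape; no tolist()/np.array() round-trip is needed on either side.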