From 19ba9d3845d4e4bbfdec3fd2dd346bc0c6d3653c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 18 Jul 2023 19:27:41 -0400
Subject: [PATCH] Use numpy arrays for logits_processors and stopping_criteria.
 Closes #491

---
 llama_cpp/llama.py      | 31 ++++++++++++++++++-------------
 llama_cpp/server/app.py |  9 ++++++---
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 92ca67d..b23280d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -27,6 +27,7 @@ from .llama_types import *
 import numpy as np
 import numpy.typing as npt
 
+
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -179,21 +180,27 @@ class LlamaState:
         self.llama_state_size = llama_state_size
 
 
-LogitsProcessor = Callable[[List[int], List[float]], List[float]]
+LogitsProcessor = Callable[
+    [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single]
+]
 
 
 class LogitsProcessorList(List[LogitsProcessor]):
-    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
         for processor in self:
             scores = processor(input_ids, scores)
         return scores
 
 
-StoppingCriteria = Callable[[List[int], List[float]], bool]
+StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool]
 
 
 class StoppingCriteriaList(List[StoppingCriteria]):
-    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
+    ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
 
 
@@ -274,9 +281,11 @@ class Llama:
 
         self._c_tensor_split = None
         if self.tensor_split is not None:
-            #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
-            self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd
+            self._c_tensor_split = FloatArray(
+                *tensor_split
+            )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split
 
         self.params.rope_freq_base = rope_freq_base
@@ -503,11 +512,7 @@ class Llama:
         logits: npt.NDArray[np.single] = self._scores[-1, :]
 
         if logits_processor is not None:
-            logits = np.array(
-                logits_processor(self._input_ids.tolist(), logits.tolist()),
-                dtype=np.single,
-            )
-            self._scores[-1, :] = logits
+            logits[:] = logits_processor(self._input_ids, logits)
 
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
@@ -725,7 +730,7 @@
                 logits_processor=logits_processor,
             )
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
             ):
                 return
             tokens_or_none = yield token
@@ -1014,7 +1019,7 @@
                     break
 
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
             ):
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 7c241fb..8a9b818 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -16,6 +16,9 @@ from pydantic import BaseModel, Field
 from pydantic_settings import BaseSettings
 from sse_starlette.sse import EventSourceResponse
 
+import numpy as np
+import numpy.typing as npt
+
 
 class Settings(BaseSettings):
     model: str = Field(
@@ -336,9 +339,9 @@ def make_logit_bias_processor(
             to_bias[input_id] = score
 
     def logit_bias_processor(
-        input_ids: List[int],
-        scores: List[float],
-    ) -> List[float]:
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
+    ) -> npt.NDArray[np.single]:
         new_scores = [None] * len(scores)
         for input_id, score in enumerate(scores):
             new_scores[input_id] = score + to_bias.get(input_id, 0.0)
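
The hunks above change the LogitsProcessor and StoppingCriteria callbacks to take numpy arrays (np.intc token ids, np.single scores) instead of Python lists, and the sampling path now writes the processor's result back with logits[:] = logits_processor(...), so a processor must fill or return an array of the same length. The sketch below is illustrative only, not part of the patch: it assumes the llama_cpp.Llama, llama_cpp.LogitsProcessorList, and llama_cpp.StoppingCriteriaList names referenced in the diff, and the model path and token ids are placeholders.

# Caller-side sketch (not part of the patch); the model path and token ids are
# placeholders chosen only to illustrate the new numpy-based signatures.
import numpy as np
import numpy.typing as npt

import llama_cpp


def suppress_token(
    input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
) -> npt.NDArray[np.single]:
    # scores is the logits row for the next token; mask a hypothetical token id
    # with numpy indexing instead of rebuilding a Python list.
    scores[123] = -np.inf  # placeholder token id
    return scores


def stop_after_token(
    input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
) -> bool:
    # input_ids is now an array of generated token ids rather than a list.
    return input_ids.size > 0 and int(input_ids[-1]) == 2  # placeholder stop id


llm = llama_cpp.Llama(model_path="./models/7B/ggml-model.bin")  # placeholder path
output = llm(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=32,
    logits_processor=llama_cpp.LogitsProcessorList([suppress_token]),
    stopping_criteria=llama_cpp.StoppingCriteriaList([stop_after_token]),
)
print(output["choices"][0]["text"])

Because the result is assigned back into the scores row, a processor may either mutate the array in place or return a fresh array of the same shape; no tolist()/np.array() round-trip is needed on either side.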