diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 173f132..f24d7ec 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -213,7 +213,7 @@ class _LlamaModel: NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + _llama_free_model = None def __init__( self, @@ -226,6 +226,8 @@ class _LlamaModel: self.params = params self.verbose = verbose + self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + if not os.path.exists(path_model): raise ValueError(f"Model path does not exist: {path_model}") @@ -236,7 +238,7 @@ class _LlamaModel: def __del__(self): with suppress_stdout_stderr(disable=self.verbose): - if self.model is not None: + if self.model is not None and self._llama_free_model is not None: self._llama_free_model(self.model) self.model = None @@ -396,7 +398,7 @@ class _LlamaContext: NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free = llama_cpp._lib.llama_free # type: ignore + _llama_free = None def __init__( self, @@ -409,6 +411,8 @@ class _LlamaContext: self.params = params self.verbose = verbose + self._llama_free = llama_cpp._lib.llama_free # type: ignore + with suppress_stdout_stderr(disable=self.verbose): self.ctx = llama_cpp.llama_new_context_with_model( self.model.model, self.params @@ -416,7 +420,7 @@ class _LlamaContext: def __del__(self): with suppress_stdout_stderr(disable=self.verbose): - if self.ctx is not None: + if self.ctx is not None and self._llama_free is not None: self._llama_free(self.ctx) self.ctx = None @@ -645,7 +649,7 @@ class _LlamaContext: class _LlamaBatch: - _llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + _llama_batch_free = None def __init__( self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True @@ -655,6 +659,8 @@ class _LlamaBatch: self.n_seq_max = n_seq_max self.verbose = verbose + self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + with suppress_stdout_stderr(disable=self.verbose): self.batch = llama_cpp.llama_batch_init( self.n_tokens, self.embd, self.n_seq_max @@ -662,7 +668,7 @@ class _LlamaBatch: def __del__(self): with suppress_stdout_stderr(disable=self.verbose): - if self.batch is not None: + if self.batch is not None and self._llama_batch_free is not None: self._llama_batch_free(self.batch) self.batch = None diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 512de6f..c03a635 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -9,6 +9,8 @@ import llama_cpp.llama as llama import llama_cpp.llama_types as llama_types import llama_cpp.llama_grammar as llama_grammar +from ._utils import suppress_stdout_stderr + class LlamaChatCompletionHandler(Protocol): def __call__( @@ -775,20 +777,26 @@ def functionary_chat_handler( class Llava15ChatHandler: - def __init__(self, clip_model_path: str): + _clip_free = None + + def __init__(self, clip_model_path: str, verbose: bool = False): import llama_cpp.llava_cpp as llava_cpp self._llava_cpp = llava_cpp self.clip_model_path = clip_model_path + self.verbose = verbose + self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore - self.clip_ctx = self._llava_cpp.clip_model_load( - self.clip_model_path.encode(), 0 - ) + with suppress_stdout_stderr(disable=self.verbose): + self.clip_ctx = self._llava_cpp.clip_model_load( + self.clip_model_path.encode(), 0 + ) def __del__(self): - if self.clip_ctx is not None: - self._llava_cpp.clip_free(self.clip_ctx) - self.clip_ctx = None + with suppress_stdout_stderr(disable=self.verbose): + if self.clip_ctx is not None and self._clip_free is not None: + self._clip_free(self.clip_ctx) + self.clip_ctx = None def load_image(self, image_url: str) -> bytes: if image_url.startswith("data:"): @@ -881,27 +889,28 @@ class Llava15ChatHandler: c_ubyte_ptr = ( ctypes.c_ubyte * len(data_array) ).from_buffer(data_array) - embed = self._llava_cpp.llava_image_embed_make_with_bytes( - ctx_clip=self.clip_ctx, - n_threads=llama.context_params.n_threads, - image_bytes=c_ubyte_ptr, - image_bytes_length=len(image_bytes), - ) - # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes) - # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes)) + with suppress_stdout_stderr(disable=self.verbose): + embed = self._llava_cpp.llava_image_embed_make_with_bytes( + ctx_clip=self.clip_ctx, + n_threads=llama.context_params.n_threads, + image_bytes=c_ubyte_ptr, + image_bytes_length=len(image_bytes), + ) try: n_past = ctypes.c_int(llama.n_tokens) n_past_p = ctypes.pointer(n_past) - self._llava_cpp.llava_eval_image_embed( - ctx_llama=llama.ctx, - embed=embed, - n_batch=llama.n_batch, - n_past=n_past_p, - ) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_eval_image_embed( + ctx_llama=llama.ctx, + embed=embed, + n_batch=llama.n_batch, + n_past=n_past_p, + ) assert llama.n_ctx() >= n_past.value llama.n_tokens = n_past.value finally: - self._llava_cpp.llava_image_embed_free(embed) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_image_embed_free(embed) if message["role"] == "assistant" and message["content"] is not None: llama.eval( llama.tokenize( @@ -910,7 +919,7 @@ class Llava15ChatHandler: ) llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False)) - prompt = llama._input_ids.tolist() + prompt = llama.input_ids[:llama.n_tokens].tolist() return _convert_completion_to_chat( llama.create_completion( diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e4e1891..3baeee8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -384,7 +384,7 @@ def create_app(settings: Optional[Settings] = None): chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None - chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path) + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose) ## llama = llama_cpp.Llama(