From f4090a0bb2a2a25acfe28d31c82cc1aa273bedee Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 13 Sep 2023 23:00:43 -0400
Subject: [PATCH] Add numa support, low level api users must now explicitly
 call llama_backend_init at the start of their programs.

---
 README.md                                         |  1 +
 examples/low_level_api/low_level_api_llama_cpp.py |  2 ++
 llama_cpp/llama.py                                | 13 +++++++++++++
 llama_cpp/llama_cpp.py                            |  9 ---------
 llama_cpp/server/app.py                           |  4 ++++
 5 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 7e64f99..07accb6 100644
--- a/README.md
+++ b/README.md
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index ad8f33c..e3cff32 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -4,6 +4,8 @@ import multiprocessing
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5040205..5d093be 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
 
@@ -272,6 +276,15 @@ class Llama:
         """
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index c6465f5..5cad038 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
-
-###################################################################################################
-
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 9e29555..9f47c5f 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
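
For context, here is a minimal usage sketch (not part of the patch) showing how a high-level API caller would opt into NUMA support after this change. The model path is a placeholder and assumes a local GGML model file exists.

```python
from llama_cpp import Llama

# Llama.__init__ now calls llama_backend_init(numa) on the first construction
# in the process, so high-level users only need to pass the flag; subsequent
# instances reuse the already-initialized backend.
llm = Llama(model_path="./models/7b/ggml-model.bin", numa=True)

# Low-level API users, by contrast, must initialize the backend themselves
# before any other llama_cpp calls (as shown in the README hunk above):
#   import llama_cpp
#   llama_cpp.llama_backend_init(numa=False)
```

Removing the module-level `llama_backend_init(False)` call means initialization is no longer a side effect of `import llama_cpp`, and the NUMA setting becomes the application's choice rather than being fixed at import time.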