Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.

This commit is contained in:
parent c999325e8e
commit f4090a0bb2

5 changed files with 20 additions and 9 deletions
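In practice the new contract for programs that use the raw bindings looks like the sketch below. This is an illustration rather than code from this change; `llama_backend_free` is assumed to be exposed by the bindings as well, mirroring the upstream llama.cpp API.

```python
import llama_cpp

# Must run exactly once, before any other llama_* call; the numa flag chosen
# here is fixed for the lifetime of the process.
llama_cpp.llama_backend_init(numa=False)

# ... load models, create contexts, tokenize, evaluate ...

# Assumed shutdown counterpart (mirrors upstream llama.cpp).
llama_cpp.llama_backend_free()
```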
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
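For orientation, the README example might continue roughly as follows after the lines shown in this hunk. This is a sketch, not part of the diff: `llama_new_context_with_model`, `llama_tokenize` (context-based signature) and `llama_free` are assumed to be available in the bindings at this point, and the exact `llama_tokenize` argument order should be checked against `llama_cpp.py`.

```python
>>> ctx = llama_cpp.llama_new_context_with_model(model, params)
>>> max_tokens = params.n_ctx
# use ctypes arrays for array params (assumed signature)
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, ctypes.c_bool(True))
>>> llama_cpp.llama_free(ctx)
```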
@@ -4,6 +4,8 @@ import multiprocessing
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
 
@@ -272,6 +276,15 @@ class Llama:
         """
 
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
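The docstring NOTE above is worth illustrating: because `llama_backend_init` is only called for the first `Llama` instance, the `numa` value passed to any later instance has no effect. A small sketch (hypothetical usage, model path borrowed from the README example):

```python
from llama_cpp import Llama

# The first instantiation initializes the backend; its numa value is the one
# that applies for the rest of the process.
llm_a = Llama(model_path="./models/7b/ggml-model.bin", numa=True)

# Backend initialization is skipped here, so numa=False is effectively ignored.
llm_b = Llama(model_path="./models/7b/ggml-model.bin", numa=False)
```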
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 
-
-###################################################################################################
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
-
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
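Since `Settings` is a pydantic `BaseSettings` model, the new field can presumably also be toggled through the environment like the other server options. The sketch below assumes no `env_prefix` is configured, that `Settings` lives in `llama_cpp.server.app`, and that it defines a `model` path field elsewhere (none of which is shown in this diff):

```python
import os

from llama_cpp import Llama
from llama_cpp.server.app import Settings  # assumed module path

# pydantic BaseSettings picks field values up from matching environment variables.
os.environ["MODEL"] = "./models/7b/ggml-model.bin"  # assumed existing `model` field
os.environ["NUMA"] = "true"

settings = Settings()
# The server is assumed to forward the setting to the wrapper roughly like this:
llm = Llama(model_path=settings.model, numa=settings.numa)
```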