Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.

Andrei Betlen 2023-09-13 23:00:43 -04:00
parent c999325e8e
commit f4090a0bb2
5 changed files with 20 additions and 9 deletions

View file

@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
```python
>>> import llama_cpp
>>> import ctypes
>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
>>> params = llama_cpp.llama_context_default_params()
# use bytes for char * params
>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)

View file

@@ -4,6 +4,8 @@ import multiprocessing
import llama_cpp
llama_cpp.llama_backend_init(numa=False)
N_THREADS = multiprocessing.cpu_count()
MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
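
The example now initializes the backend explicitly before any other llama_cpp call. Below is a minimal sketch of the same pattern with an explicit teardown; it assumes llama_cpp also binds llama_backend_free (the C API's counterpart to llama_backend_init):

```python
# Sketch: initialize the backend once at startup and release it at interpreter exit.
# llama_backend_free is assumed to be exposed by llama_cpp alongside llama_backend_init.
import atexit

import llama_cpp

llama_cpp.llama_backend_init(numa=False)  # must run before any other llama_cpp call
atexit.register(llama_cpp.llama_backend_free)  # free backend resources on exit
```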

View file

@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
class Llama:
    """High-level Python wrapper for a llama.cpp model."""

    __backend_initialized = False

    def __init__(
        self,
        model_path: str,
@@ -234,6 +236,7 @@ class Llama:
        last_n_tokens_size: int = 64,
        lora_base: Optional[str] = None,
        lora_path: Optional[str] = None,
        numa: bool = False,
        verbose: bool = True,
        **kwargs  # type: ignore
    ):
@@ -261,6 +264,7 @@
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
            numa: Enable NUMA support. (NOTE: The value passed when the first Llama instance is created is used for the remainder of the program, since it is forwarded to llama_backend_init, which only runs once.)
            verbose: Print verbose output to stderr.
            kwargs: Unused keyword arguments (for additional backwards compatibility).
@@ -272,6 +276,15 @@
        """
        self.verbose = verbose

        if not Llama.__backend_initialized:
            if self.verbose:
                llama_cpp.llama_backend_init(numa)
            else:
                with suppress_stdout_stderr():
                    llama_cpp.llama_backend_init(numa)
            Llama.__backend_initialized = True

        self.model_path = model_path
        self.params = llama_cpp.llama_context_default_params()
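
For users of the high-level wrapper the call happens automatically: the first `Llama` constructed initializes the backend with its `numa` argument, and later instances reuse that setting. A small usage sketch (the model path is illustrative):

```python
from llama_cpp import Llama

# The first instance triggers llama_backend_init(numa=True); the flag cannot be
# changed by later instances because the backend is initialized only once.
llm = Llama(model_path="./models/7b/ggml-model.bin", numa=True)
```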

View file

@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
_lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
_lib.llama_dump_timing_info_yaml.restype = None
###################################################################################################
_llama_initialized = False
if not _llama_initialized:
    llama_backend_init(False)
    _llama_initialized = True
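
Since the module no longer initializes the backend as a side effect of being imported (the block removed above), low-level callers have to make the call themselves before using the rest of the API. A minimal sketch using the functions shown in this diff; llama_free_model and llama_backend_free are assumed to be bound in the same module:

```python
import llama_cpp

# Initialize the backend explicitly, once per process, before any other call.
llama_cpp.llama_backend_init(numa=False)

params = llama_cpp.llama_context_default_params()
# use bytes for char * params
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)

# ... tokenize / evaluate with the model ...

llama_cpp.llama_free_model(model)  # assumed binding for the C API's llama_free_model
llama_cpp.llama_backend_free()     # assumed binding for the C API's llama_backend_free
```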

View file

@@ -98,6 +98,10 @@ class Settings(BaseSettings):
        default=None,
        description="Path to a LoRA file to apply to the model.",
    )
    numa: bool = Field(
        default=False,
        description="Enable NUMA support.",
    )
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
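
The new setting can also be enabled when configuring the server programmatically. A hedged sketch, assuming the `Settings` and `create_app(settings=...)` names in `llama_cpp.server.app` used by the server's entry point, and that the setting is forwarded to the `Llama` constructor as `numa=True`:

```python
import uvicorn

# Assumed module layout: llama_cpp.server.app exposes Settings and create_app.
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/7b/ggml-model.bin",  # illustrative path
    numa=True,  # expected to be forwarded as Llama(..., numa=True)
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app)
```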