Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.

Andrei Betlen 2023-09-13 23:00:43 -04:00
parent c999325e8e
commit f4090a0bb2
5 changed files with 20 additions and 9 deletions
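
For low-level API users, the change amounts to one extra call before any other llama_cpp function. A minimal sketch assembled from the README hunk below (the model path is the illustrative one used there):

```python
import llama_cpp

# New requirement: initialize the backend explicitly, once per process,
# before any other low-level calls. Pass numa=True to enable NUMA support.
llama_cpp.llama_backend_init(numa=False)

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
```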


@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)


@@ -4,6 +4,8 @@ import multiprocessing
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")


@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs # type: ignore
     ):
@@ -261,6 +264,7 @@ class Llama:
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
+           numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
            verbose: Print verbose output to stderr.
            kwargs: Unused keyword arguments (for additional backwards compatibility).
@@ -272,6 +276,15 @@
         """
         self.verbose = verbose
 
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
         self.params = llama_cpp.llama_context_default_params()
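
High-level users only need the new constructor argument; a short sketch of the intended usage, with an illustrative model path:

```python
from llama_cpp import Llama

# The first Llama instance created in a process decides the NUMA setting,
# since llama_backend_init is only invoked once (see the docstring note above);
# later instances reuse whatever was set first.
llm = Llama(model_path="./models/7b/ggml-model.bin", numa=True)
```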


@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
-
-###################################################################################################
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
-
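
With this block removed, merely importing llama_cpp no longer initializes the backend as a side effect; low-level callers must make the explicit llama_backend_init call shown above, while the high-level Llama class now performs it lazily on first construction.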


@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",