Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.
This commit is contained in:
parent c999325e8e
commit f4090a0bb2

5 changed files with 20 additions and 9 deletions
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
@@ -4,6 +4,8 @@ import multiprocessing
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
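Taken together, the two hunks above establish the new contract for low-level callers: `llama_backend_init` must run once, before any other `llama_cpp` call. A minimal sketch of a low-level program under this contract follows; the model path is illustrative, and the cleanup calls (`llama_free`, `llama_free_model`, `llama_backend_free`) are assumed to be exposed by the bindings just as in llama.cpp's C API.

```python
import llama_cpp

# One-time backend setup; the numa flag chosen here applies for the
# whole process because it is passed straight to llama_backend_init.
llama_cpp.llama_backend_init(numa=False)

params = llama_cpp.llama_context_default_params()

# The C API takes paths as bytes (illustrative model path).
model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# ... tokenize / evaluate / sample against ctx here ...

# Tear down in reverse order of creation (assumes these counterparts
# are bound alongside llama_backend_init).
llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
```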
@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
 
@@ -272,6 +276,15 @@ class Llama:
         """
 
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
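The effect of the guard above is that the high-level API keeps its old ergonomics: the first `Llama` constructed in a process initializes the backend, and only that first instance's `numa` value matters. A small usage sketch (model path illustrative):

```python
from llama_cpp import Llama

# The first instance triggers llama_backend_init(numa=True); NUMA support
# is therefore enabled for the rest of the process.
llm = Llama(model_path="./models/7B/ggml-model.bin", numa=True)

# Subsequent instances skip backend initialization entirely, so their
# numa argument (defaulting to False) has no further effect.
llm2 = Llama(model_path="./models/7B/ggml-model.bin")
```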
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 
-
-###################################################################################################
-
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
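With the import-time initialization above removed, scripts that use `llama_cpp` directly (rather than the `Llama` class) must now perform the call themselves, once, before any other binding is used. A sketch, assuming the bindings also expose `llama_backend_free` as the shutdown counterpart:

```python
import atexit

import llama_cpp

# Previously this happened automatically when llama_cpp was imported;
# it is now the caller's responsibility, once per process.
llama_cpp.llama_backend_init(numa=False)

# Optional: release backend resources when the interpreter exits
# (assumes the llama_backend_free binding mirrors llama.cpp's C API).
atexit.register(llama_cpp.llama_backend_free)
```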
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
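For the server, the new `numa` setting can presumably be enabled the same way as the other `Settings` fields, for example via the matching environment variable (`NUMA=true python3 -m llama_cpp.server`) or the corresponding command-line flag, since `Settings` is a pydantic `BaseSettings` class; as with the `Llama` constructor, the value only takes effect for the first backend initialization in the process.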