Add NUMA support; low-level API users must now explicitly call llama_backend_init at the start of their programs.

This commit is contained in:
parent c999325e8e
commit f4090a0bb2

5 changed files with 20 additions and 9 deletions
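In practice the new contract for programs that use the raw bindings looks like the sketch below. This is an illustration rather than code from this change; `llama_backend_free` is assumed to be exposed by the bindings as well, mirroring the upstream llama.cpp API.

```python
import llama_cpp

# Must run exactly once, before any other llama_* call; the numa flag chosen
# here is fixed for the lifetime of the process.
llama_cpp.llama_backend_init(numa=False)

# ... load models, create contexts, tokenize, evaluate ...

# Assumed shutdown counterpart (mirrors upstream llama.cpp).
llama_cpp.llama_backend_free()
```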
@@ -180,6 +180,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 >>> import llama_cpp
 >>> import ctypes
+>>> llama_cpp.llama_backend_init(numa=False) # Must be called once at the start of each program
 >>> params = llama_cpp.llama_context_default_params()
 # use bytes for char * params
 >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/ggml-model.bin", params)
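For orientation, the README example might continue roughly as follows after the lines shown in this hunk. This is a sketch, not part of the diff: `llama_new_context_with_model`, `llama_tokenize` (context-based signature) and `llama_free` are assumed to be available in the bindings at this point, and the exact `llama_tokenize` argument order should be checked against `llama_cpp.py`.

```python
>>> ctx = llama_cpp.llama_new_context_with_model(model, params)
>>> max_tokens = params.n_ctx
# use ctypes arrays for array params (assumed signature)
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, ctypes.c_bool(True))
>>> llama_cpp.llama_free(ctx)
```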
@@ -4,6 +4,8 @@ import multiprocessing
 
 import llama_cpp
 
+llama_cpp.llama_backend_init(numa=False)
+
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
 
@@ -209,6 +209,8 @@ class StoppingCriteriaList(List[StoppingCriteria]):
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
+    __backend_initialized = False
+
     def __init__(
         self,
         model_path: str,
@@ -234,6 +236,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        numa: bool = False,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -261,6 +264,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             verbose: Print verbose output to stderr.
             kwargs: Unused keyword arguments (for additional backwards compatibility).
 
@@ -272,6 +276,15 @@ class Llama:
         """
 
         self.verbose = verbose
+
+        if not Llama.__backend_initialized:
+            if self.verbose:
+                llama_cpp.llama_backend_init(numa)
+            else:
+                with suppress_stdout_stderr():
+                    llama_cpp.llama_backend_init(numa)
+            Llama.__backend_initialized = True
+
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
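The docstring NOTE above is worth illustrating: because `llama_backend_init` is only called for the first `Llama` instance, the `numa` value passed to any later instance has no effect. A small sketch (hypothetical usage, model path borrowed from the README example):

```python
from llama_cpp import Llama

# The first instantiation initializes the backend; its numa value is the one
# that applies for the rest of the process.
llm_a = Llama(model_path="./models/7b/ggml-model.bin", numa=True)

# Backend initialization is skipped here, so numa=False is effectively ignored.
llm_b = Llama(model_path="./models/7b/ggml-model.bin", numa=False)
```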
@@ -1524,12 +1524,3 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
 
-
-###################################################################################################
-
-_llama_initialized = False
-
-if not _llama_initialized:
-    llama_backend_init(False)
-    _llama_initialized = True
-
@@ -98,6 +98,10 @@ class Settings(BaseSettings):
         default=None,
         description="Path to a LoRA file to apply to the model.",
     )
+    numa: bool = Field(
+        default=False,
+        description="Enable NUMA support.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
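Since `Settings` is a pydantic `BaseSettings` model, the new field can presumably also be toggled through the environment like the other server options. The sketch below assumes no `env_prefix` is configured, that `Settings` lives in `llama_cpp.server.app`, and that it defines a `model` path field elsewhere (none of which is shown in this diff):

```python
import os

from llama_cpp import Llama
from llama_cpp.server.app import Settings  # assumed module path

# pydantic BaseSettings picks field values up from matching environment variables.
os.environ["MODEL"] = "./models/7b/ggml-model.bin"  # assumed existing `model` field
os.environ["NUMA"] = "true"

settings = Settings()
# The server is assumed to forward the setting to the wrapper roughly like this:
llm = Llama(model_path=settings.model, numa=settings.numa)
```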