From fdce078cb915cecba7df58eeb494ee8493c87672 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 17 Feb 2024 00:37:51 -0500
Subject: [PATCH] feat: Update llama.cpp

---
 llama_cpp/llama.py           | 14 ++++++++++----
 llama_cpp/llama_cpp.py       | 34 +++++++++++++++++++++++++++++++---
 llama_cpp/server/settings.py |  4 ++--
 vendor/llama.cpp             |  2 +-
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 964b0c8..71f968f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -98,7 +98,7 @@ class Llama:
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
         # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@@ -166,7 +166,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
+            numa: numa policy
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
             draft_model: Optional draft model to use for speculative decoding.
@@ -183,12 +183,18 @@ class Llama:

         set_verbose(verbose)

-        self.numa = numa
         if not Llama.__backend_initialized:
             with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
             Llama.__backend_initialized = True

+        if isinstance(numa, bool):
+            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+
         self.model_path = model_path

         # Model Params
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 13daadb..2087037 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -697,17 +697,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
+def llama_backend_init():
     """Initialize the llama + ggml backend
     If numa is true, use NUMA optimizations
     Call once at the start of the program"""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()

-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None

+# // numa strategies
+# enum ggml_numa_strategy {
+#     GGML_NUMA_STRATEGY_DISABLED   = 0,
+#     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+#     GGML_NUMA_STRATEGY_ISOLATE    = 2,
+#     GGML_NUMA_STRATEGY_NUMACTL    = 3,
+#     GGML_NUMA_STRATEGY_MIRROR     = 4,
+#     GGML_NUMA_STRATEGY_COUNT
+# };
+GGML_NUMA_STRATEGY_DISABLED = 0
+GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+GGML_NUMA_STRATEGY_ISOLATE = 2
+GGML_NUMA_STRATEGY_NUMACTL = 3
+GGML_NUMA_STRATEGY_MIRROR = 4
+GGML_NUMA_STRATEGY_COUNT = 5
+
+
+# //optional:
+# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+def llama_numa_init(numa: int):
+    return _lib.llama_numa_init(numa)
+
+
+_lib.llama_numa_init.argtypes = [c_int]
+_lib.llama_numa_init.restype = None
+
+
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 60f3eec..790c6b1 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -2,7 +2,7 @@ from __future__ import annotations

 import multiprocessing

-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union

 from pydantic import Field
 from pydantic_settings import BaseSettings
@@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
         description="Path to a LoRA file to apply to the model.",
     )
     # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
         default=False,
         description="Enable NUMA support.",
     )
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 4524290..5bf2b94 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 4524290e87b8e107cc2b56e1251751546f4b9051
+Subproject commit 5bf2b94dd4fb74378b78604023b31512fec55f8f
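
Reviewer note (not part of the patch): a minimal usage sketch of the reworked numa parameter, assuming the Llama.__init__ mapping and the GGML_NUMA_STRATEGY_* constants land exactly as in the hunks above; the model path below is hypothetical.

    from llama_cpp import Llama

    # numa=True maps to GGML_NUMA_STRATEGY_DISTRIBUTE and numa=False to
    # GGML_NUMA_STRATEGY_DISABLED inside Llama.__init__ (see the llama.py hunk);
    # the new low-level binding llama_cpp.llama_cpp.llama_numa_init() is only
    # called when the resulting strategy is not DISABLED, and only affects the
    # first Llama constructed in the process (backend init happens once).
    llm = Llama(model_path="./models/example.gguf", numa=True)  # hypothetical path

The GGML_NUMA_STRATEGY_* constants added in llama_cpp/llama_cpp.py mirror the ggml_numa_strategy enum from the updated llama.cpp submodule.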