feat: Update llama.cpp

Andrei Betlen 2024-02-17 00:37:51 -05:00
parent c2a234a086
commit fdce078cb9
4 changed files with 44 additions and 10 deletions


@@ -98,7 +98,7 @@ class Llama:
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
         # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@@ -166,7 +166,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
+            numa: numa policy
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
             draft_model: Optional draft model to use for speculative decoding.
@@ -183,12 +183,18 @@ class Llama:
         set_verbose(verbose)
 
-        self.numa = numa
         if not Llama.__backend_initialized:
             with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
             Llama.__backend_initialized = True
 
+        if isinstance(numa, bool):
+            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+
         self.model_path = model_path
 
         # Model Params
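For orientation, a minimal usage sketch of the widened constructor parameter, not part of the commit: the Llama class is the package-level export and the model path is a placeholder. With this change, numa can stay a plain bool (True maps to the distribute strategy, False disables NUMA) or be given as a ggml_numa_strategy integer per the new Union[bool, int] annotation.

from llama_cpp import Llama

# Legacy-style boolean: True -> GGML_NUMA_STRATEGY_DISTRIBUTE, False -> disabled.
llm = Llama(model_path="models/model.gguf", numa=True)  # model path is a placeholder

# Integer form: pass a ggml_numa_strategy value directly
# (e.g. 2 == GGML_NUMA_STRATEGY_ISOLATE), as the Union[bool, int] annotation suggests.
llm_isolate = Llama(model_path="models/model.gguf", numa=2)  # model path is a placeholder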


@@ -697,17 +697,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
+def llama_backend_init():
     """Initialize the llama + ggml backend
     If numa is true, use NUMA optimizations
     Call once at the start of the program"""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()
 
 
-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None
 
 
+# // numa strategies
+# enum ggml_numa_strategy {
+#     GGML_NUMA_STRATEGY_DISABLED = 0,
+#     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+#     GGML_NUMA_STRATEGY_ISOLATE = 2,
+#     GGML_NUMA_STRATEGY_NUMACTL = 3,
+#     GGML_NUMA_STRATEGY_MIRROR = 4,
+#     GGML_NUMA_STRATEGY_COUNT
+# };
+GGML_NUMA_STRATEGY_DISABLED = 0
+GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+GGML_NUMA_STRATEGY_ISOLATE = 2
+GGML_NUMA_STRATEGY_NUMACTL = 3
+GGML_NUMA_STRATEGY_MIRROR = 4
+GGML_NUMA_STRATEGY_COUNT = 5
+
+
+# //optional:
+# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+def llama_numa_init(numa: int):
+    return _lib.llama_numa_init(numa)
+
+
+_lib.llama_numa_init.argtypes = [c_int]
+_lib.llama_numa_init.restype = None
+
+
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
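For reference, a short sketch of the new low-level initialization order these bindings imply: llama_backend_init() is now argument-free, NUMA is enabled separately (and optionally) through llama_numa_init() with one of the strategy constants added above, and llama_backend_free() is unchanged. This is illustrative only and uses just the names defined in this module.

from llama_cpp import llama_cpp

# Initialize the backend once at program start (no numa flag anymore).
llama_cpp.llama_backend_init()

# Optionally pick a NUMA strategy; DISABLED (0) means this call can be skipped.
llama_cpp.llama_numa_init(llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE)

# ... create contexts and run inference here ...

# Tear the backend down once at program exit.
llama_cpp.llama_backend_free()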


@@ -2,7 +2,7 @@ from __future__ import annotations
 import multiprocessing
 
-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union
 
 from pydantic import Field
 from pydantic_settings import BaseSettings
 
@@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
         description="Path to a LoRA file to apply to the model.",
     )
     # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
         default=False,
         description="Enable NUMA support.",
     )
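And a brief sketch of the corresponding server-side setting. The import path (llama_cpp.server.settings), the required model field, and the model path are assumptions for illustration; only the numa field and its Union[bool, int] type come from the diff.

from llama_cpp.server.settings import ModelSettings  # assumed module path

# Boolean on/off, as before.
settings = ModelSettings(model="models/model.gguf", numa=True)  # model path is a placeholder

# Or an explicit ggml_numa_strategy integer, matching the Union[bool, int] field type.
settings = ModelSettings(model="models/model.gguf", numa=2)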

vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit 4524290e87b8e107cc2b56e1251751546f4b9051
+Subproject commit 5bf2b94dd4fb74378b78604023b31512fec55f8f