feat: Update llama.cpp

parent c2a234a086
commit fdce078cb9

4 changed files with 44 additions and 10 deletions
@@ -98,7 +98,7 @@ class Llama:
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
         # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
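A minimal usage sketch of the widened constructor parameter, assuming the public `llama_cpp.Llama` class and assuming the `GGML_NUMA_STRATEGY_*` constants added later in this commit are re-exported from the top-level `llama_cpp` package; the model path is a placeholder:

    import llama_cpp
    from llama_cpp import Llama

    # Boolean form (old behaviour): True maps to the "distribute" strategy,
    # False leaves NUMA handling disabled.
    llm = Llama(model_path="./models/example.gguf", numa=True)  # placeholder path

    # Integer form (new): pass a ggml_numa_strategy value directly, e.g. to
    # respect an external numactl placement.
    llm = Llama(
        model_path="./models/example.gguf",           # placeholder path
        numa=llama_cpp.GGML_NUMA_STRATEGY_NUMACTL,
    )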
@@ -166,7 +166,7 @@ class Llama:
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
+            numa: numa policy
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
             draft_model: Optional draft model to use for speculative decoding.
@@ -183,12 +183,18 @@ class Llama:

         set_verbose(verbose)

-        self.numa = numa
         if not Llama.__backend_initialized:
             with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
             Llama.__backend_initialized = True

+        if isinstance(numa, bool):
+            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+
         self.model_path = model_path

         # Model Params
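For readers skimming the hunk above, the new resolution step can be sketched in isolation; `_resolve_numa_strategy` is a hypothetical helper name used only for illustration, and the constants mirror the values bound later in this commit:

    from typing import Union

    import llama_cpp

    def _resolve_numa_strategy(numa: Union[bool, int]) -> int:
        """Map the public `numa` argument onto a ggml_numa_strategy value.

        A plain bool keeps the old on/off behaviour: True distributes work
        across NUMA nodes, False leaves NUMA handling disabled. An int is
        assumed to already be one of the GGML_NUMA_STRATEGY_* constants.
        """
        if isinstance(numa, bool):
            return (
                llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE
                if numa
                else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
            )
        return numa

    strategy = _resolve_numa_strategy(True)
    print(strategy)  # 1 == GGML_NUMA_STRATEGY_DISTRIBUTE
    # The constructor above then calls llama_cpp.llama_numa_init(strategy)
    # only when the strategy is not GGML_NUMA_STRATEGY_DISABLED.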
@@ -697,17 +697,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
+def llama_backend_init():
     """Initialize the llama + ggml backend
     If numa is true, use NUMA optimizations
     Call once at the start of the program"""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()


-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None


+# // numa strategies
+# enum ggml_numa_strategy {
+#     GGML_NUMA_STRATEGY_DISABLED = 0,
+#     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+#     GGML_NUMA_STRATEGY_ISOLATE = 2,
+#     GGML_NUMA_STRATEGY_NUMACTL = 3,
+#     GGML_NUMA_STRATEGY_MIRROR = 4,
+#     GGML_NUMA_STRATEGY_COUNT
+# };
+GGML_NUMA_STRATEGY_DISABLED = 0
+GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+GGML_NUMA_STRATEGY_ISOLATE = 2
+GGML_NUMA_STRATEGY_NUMACTL = 3
+GGML_NUMA_STRATEGY_MIRROR = 4
+GGML_NUMA_STRATEGY_COUNT = 5
+
+
+# //optional:
+# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+def llama_numa_init(numa: int):
+    return _lib.llama_numa_init(numa)
+
+
+_lib.llama_numa_init.argtypes = [c_int]
+_lib.llama_numa_init.restype = None
+
+
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
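A short sketch of driving the updated low-level bindings directly; the module alias matches how the Llama constructor refers to them, but the exact import path (`llama_cpp.llama_cpp`) is an assumption about the project layout rather than something shown in this diff:

    import llama_cpp.llama_cpp as llama_cpp

    # Backend initialization no longer takes the NUMA flag.
    llama_cpp.llama_backend_init()

    # NUMA setup is now a separate, optional call that takes one of the
    # ggml_numa_strategy values bound above.
    llama_cpp.llama_numa_init(llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE)

    # ... load models and run inference ...

    # Call once at the end of the program.
    llama_cpp.llama_backend_free()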
@@ -2,7 +2,7 @@ from __future__ import annotations

 import multiprocessing

-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union
 from pydantic import Field
 from pydantic_settings import BaseSettings

@@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
         description="Path to a LoRA file to apply to the model.",
     )
     # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
         default=False,
         description="Enable NUMA support.",
     )
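A sketch of exercising the widened server setting, assuming `ModelSettings` lives in the server's settings module (the import path below is not shown in this diff) and using a placeholder model path:

    # Import path assumed from the project layout; adjust if it differs.
    from llama_cpp.server.settings import ModelSettings

    # Boolean form, as before.
    s_bool = ModelSettings(model="./models/example.gguf", numa=True)

    # Integer form: a raw ggml_numa_strategy value (3 == NUMACTL in llama.cpp).
    s_int = ModelSettings(model="./models/example.gguf", numa=3)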
vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit 4524290e87b8e107cc2b56e1251751546f4b9051
+Subproject commit 5bf2b94dd4fb74378b78604023b31512fec55f8f