320a5d7ea5
* feat: add explicit methods to free model

  This commit introduces a `close` method on both `Llama` and `_LlamaModel`, allowing users to explicitly free the model from RAM/VRAM. The previous implementation relied on the destructor of `_LlamaModel` to free the model; however, in Python the timing of destructor calls is not guaranteed, and the `del` statement does not ensure immediate invocation of the destructor. The explicit method releases the model immediately, so another model can be loaded without memory issues. Additionally, this commit implements a context manager in the `Llama` class, enabling automatic closure of the `Llama` object when it is used in a `with` statement.

* feat: Implement ContextManager in _LlamaModel, _LlamaContext, and _LlamaBatch

  This commit enables automatic resource management by implementing the `ContextManager` protocol in `_LlamaModel`, `_LlamaContext`, and `_LlamaBatch`, ensuring that resources are properly released within a `with` statement and making resource handling more robust.

* feat: add ExitStack for Llama's internal class closure

  This update uses an `ExitStack` to manage and close the internal classes of `Llama`, providing efficient and safe resource management.

* Use contextlib ExitStack and closing

* Explicitly free model when closing resources on server

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
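As a quick illustration of the behavior described above, the sketch below shows the two ways a model can now be released: explicitly via `close()` or automatically via a `with` block. This is a minimal usage sketch, not part of the file listed below; the model paths are placeholders.

# Minimal sketch of the explicit-free API described in the commit message above.
# "./models/example.gguf" and "./models/other.gguf" are placeholder paths.
import llama_cpp

# Context-manager form: the model is freed automatically when the block exits.
with llama_cpp.Llama(model_path="./models/example.gguf") as llm:
    out = llm.create_completion("Q: What is 2 + 2? A:", max_tokens=8)
    print(out["choices"][0]["text"])

# Explicit form: free the model from RAM/VRAM immediately, then load another.
llm = llama_cpp.Llama(model_path="./models/example.gguf")
llm.close()  # releases the model right away, without waiting for the GC
other = llama_cpp.Llama(model_path="./models/other.gguf")
other.close()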
282 lines
12 KiB
Python
from __future__ import annotations

import json

from typing import Dict, Optional, Union, List

import llama_cpp
import llama_cpp.llama_speculative as llama_speculative
import llama_cpp.llama_tokenizer as llama_tokenizer

from llama_cpp.server.settings import ModelSettings


class LlamaProxy:
    def __init__(self, models: List[ModelSettings]) -> None:
        assert len(models) > 0, "No models provided!"

        self._model_settings_dict: dict[str, ModelSettings] = {}
        for model in models:
            if not model.model_alias:
                model.model_alias = model.model
            self._model_settings_dict[model.model_alias] = model

        self._current_model: Optional[llama_cpp.Llama] = None
        self._current_model_alias: Optional[str] = None

        self._default_model_settings: ModelSettings = models[0]
        self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore

        # Load default model
        self._current_model = self.load_llama_from_model_settings(
            self._default_model_settings
        )
        self._current_model_alias = self._default_model_alias

    def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
        if model is None:
            model = self._default_model_alias

        if model not in self._model_settings_dict:
            model = self._default_model_alias

        if model == self._current_model_alias:
            if self._current_model is not None:
                return self._current_model

        if self._current_model:
            self._current_model.close()
        self._current_model = None

        settings = self._model_settings_dict[model]
        self._current_model = self.load_llama_from_model_settings(settings)
        self._current_model_alias = model
        return self._current_model

    def __getitem__(self, model: str):
        return self._model_settings_dict[model].model_dump()

    def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]):
        if isinstance(settings, (bytes, str)):
            settings = ModelSettings.model_validate_json(settings)
        self._model_settings_dict[model] = settings

    def __iter__(self):
        for model in self._model_settings_dict:
            yield model

    def free(self):
        if self._current_model:
            self._current_model.close()
            del self._current_model

    @staticmethod
    def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
        chat_handler = None
        if settings.chat_format == "llava-1-5":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "obsidian":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "llava-1-6":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "moondream":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "nanollava":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "llama-3-vision-alpha":
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                chat_handler = (
                    llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
                        repo_id=settings.hf_model_repo_id,
                        filename=settings.clip_model_path,
                        verbose=settings.verbose,
                    )
                )
            else:
                chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
                )
        elif settings.chat_format == "hf-autotokenizer":
            assert (
                settings.hf_pretrained_model_name_or_path is not None
            ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer"
            chat_handler = (
                llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler(
                    settings.hf_pretrained_model_name_or_path
                )
            )
        elif settings.chat_format == "hf-tokenizer-config":
            assert (
                settings.hf_tokenizer_config_path is not None
            ), "hf_tokenizer_config_path must be set for hf-tokenizer-config"
            chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
                json.load(open(settings.hf_tokenizer_config_path))
            )

        tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
        if settings.hf_pretrained_model_name_or_path is not None:
            tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(
                settings.hf_pretrained_model_name_or_path
            )

        draft_model = None
        if settings.draft_model is not None:
            draft_model = llama_speculative.LlamaPromptLookupDecoding(
                num_pred_tokens=settings.draft_model_num_pred_tokens
            )

        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
        if settings.kv_overrides is not None:
            assert isinstance(settings.kv_overrides, list)
            kv_overrides = {}
            for kv in settings.kv_overrides:
                key, value = kv.split("=")
                if ":" in value:
                    value_type, value = value.split(":")
                    if value_type == "bool":
                        kv_overrides[key] = value.lower() in ["true", "1"]
                    elif value_type == "int":
                        kv_overrides[key] = int(value)
                    elif value_type == "float":
                        kv_overrides[key] = float(value)
                    elif value_type == "str":
                        kv_overrides[key] = value
                    else:
                        raise ValueError(f"Unknown value type {value_type}")

        import functools

        kwargs = {}

        if settings.hf_model_repo_id is not None:
            create_fn = functools.partial(
                llama_cpp.Llama.from_pretrained,
                repo_id=settings.hf_model_repo_id,
                filename=settings.model,
            )
        else:
            create_fn = llama_cpp.Llama
            kwargs["model_path"] = settings.model

        _model = create_fn(
            **kwargs,
            # Model Params
            n_gpu_layers=settings.n_gpu_layers,
            main_gpu=settings.main_gpu,
            tensor_split=settings.tensor_split,
            vocab_only=settings.vocab_only,
            use_mmap=settings.use_mmap,
            use_mlock=settings.use_mlock,
            kv_overrides=kv_overrides,
            rpc_servers=settings.rpc_servers,
            # Context Params
            seed=settings.seed,
            n_ctx=settings.n_ctx,
            n_batch=settings.n_batch,
            n_threads=settings.n_threads,
            n_threads_batch=settings.n_threads_batch,
            rope_scaling_type=settings.rope_scaling_type,
            rope_freq_base=settings.rope_freq_base,
            rope_freq_scale=settings.rope_freq_scale,
            yarn_ext_factor=settings.yarn_ext_factor,
            yarn_attn_factor=settings.yarn_attn_factor,
            yarn_beta_fast=settings.yarn_beta_fast,
            yarn_beta_slow=settings.yarn_beta_slow,
            yarn_orig_ctx=settings.yarn_orig_ctx,
            mul_mat_q=settings.mul_mat_q,
            logits_all=settings.logits_all,
            embedding=settings.embedding,
            offload_kqv=settings.offload_kqv,
            flash_attn=settings.flash_attn,
            # Sampling Params
            last_n_tokens_size=settings.last_n_tokens_size,
            # LoRA Params
            lora_base=settings.lora_base,
            lora_path=settings.lora_path,
            # Backend Params
            numa=settings.numa,
            # Chat Format Params
            chat_format=settings.chat_format,
            chat_handler=chat_handler,
            # Speculative Decoding
            draft_model=draft_model,
            # KV Cache Quantization
            type_k=settings.type_k,
            type_v=settings.type_v,
            # Tokenizer
            tokenizer=tokenizer,
            # Misc
            verbose=settings.verbose,
        )
        if settings.cache:
            if settings.cache_type == "disk":
                if settings.verbose:
                    print(f"Using disk cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
            else:
                if settings.verbose:
                    print(f"Using ram cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
            _model.set_cache(cache)
        return _model