diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 46a9aeb..a6f1e76 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -219,6 +219,7 @@ class Llama:
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
+        low_vram: bool = False,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -260,6 +261,7 @@ class Llama:
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
+        self.params.low_vram = low_vram

         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -1447,6 +1449,7 @@ class Llama:
             use_mmap=self.params.use_mmap,
             use_mlock=self.params.use_mlock,
             embedding=self.params.embedding,
+            low_vram=self.params.low_vram,
             last_n_tokens_size=self.last_n_tokens_size,
             n_batch=self.n_batch,
             n_threads=self.n_threads,
@@ -1470,6 +1473,7 @@ class Llama:
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
             embedding=state["embedding"],
+            low_vram=state["low_vram"],
             n_threads=state["n_threads"],
             n_batch=state["n_batch"],
             last_n_tokens_size=state["last_n_tokens_size"],
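
Usage sketch for the new flag (the model path below is hypothetical; the diff shows that `low_vram` is simply forwarded to llama.cpp's context params and round-trips through the pickling state in __getstate__/__setstate__):

    from llama_cpp import Llama

    # Hypothetical model path; low_vram=True passes llama.cpp's
    # low-VRAM mode through to the underlying context params,
    # trading some speed for reduced GPU memory usage.
    llm = Llama(model_path="./models/7B/ggml-model.bin", low_vram=True)

    out = llm("Q: What is the capital of France? A:", max_tokens=16)
    print(out["choices"][0]["text"])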