Update llama.cpp
parent 4177ae6d34, commit 1a1c3dc418
4 changed files with 742 additions and 356 deletions
@@ -30,6 +30,7 @@ import numpy.typing as npt
 
 from .utils import suppress_stdout_stderr
 
 
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -215,30 +216,37 @@ class Llama:
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
-        mul_mat_q: bool = True,
-        f16_kv: bool = True,
-        logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
-        embedding: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
         n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        mul_mat_q: bool = True,
+        f16_kv: bool = True,
+        logits_all: bool = False,
+        embedding: bool = False,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
+        # Misc
         verbose: bool = True,
-        **kwargs # type: ignore
+        # Extra Params
+        **kwargs, # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
 
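For reference, a constructor call written against the regrouped keyword arguments above might look like the following sketch. The model path and the specific values are placeholders, not part of the commit; only the parameter names come from the signature shown in the hunk.

```python
from llama_cpp import Llama

# Hypothetical model path; any local GGUF model file would do.
llm = Llama(
    model_path="./models/example-model.gguf",
    # Model params
    n_gpu_layers=-1,       # -1 is mapped to INT32 max, i.e. offload all layers
    use_mmap=True,
    # Context params
    seed=1234,
    n_ctx=2048,
    n_batch=512,
    n_threads=None,        # defaults to half the CPU core count
    n_threads_batch=None,  # separate thread count for batch processing
    # Misc
    verbose=True,
)
```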
@@ -277,52 +285,64 @@ class Llama:
 
         self.verbose = verbose
 
+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True
 
         self.model_path = model_path
 
-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
+            0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
+        ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.main_gpu = main_gpu
 
         self.tensor_split = tensor_split
         self._p_tensor_split = None
 
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split # type: ignore
             ) # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
 
+        self.n_batch = min(n_ctx, n_batch) # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )
+
+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
 
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)
 
         self.cache: Optional[BaseLlamaCache] = None
 
-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
-
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path
 
         if not os.path.exists(model_path):
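The constructor now fills two separate native parameter structs instead of one: model-level settings go into `llama_model_default_params()` and per-context settings into `llama_context_default_params()`. A minimal sketch of that split at the ctypes-binding level, using only the names that appear in the hunk above (the concrete values are placeholders):

```python
import ctypes
import llama_cpp

# Model-level settings (GPU offload, mmap/mlock, vocab_only) live in one struct...
model_params = llama_cpp.llama_model_default_params()
model_params.n_gpu_layers = 0x7FFFFFFF  # INT32 max, i.e. offload all layers
model_params.use_mmap = True

# ...and per-context settings (seed, context size, threads, RoPE) in another.
context_params = llama_cpp.llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = 2048
context_params.n_threads = 4
context_params.n_threads_batch = 4

# tensor_split becomes a fixed-size C float array; keep a Python reference to it
# so it is not garbage collected while the native side still points at it.
tensor_split = [1.0 / llama_cpp.LLAMA_MAX_DEVICES] * llama_cpp.LLAMA_MAX_DEVICES
FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
c_tensor_split = FloatArray(*tensor_split)
model_params.tensor_split = c_tensor_split
```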
@@ -330,21 +350,23 @@ class Llama:
 
         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None
 
         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )
 
         assert self.ctx is not None
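Loading is now explicitly two steps: the weights are loaded with the model params, then a context is created from that model with the context params. A sketch at the binding level, using the call shapes shown above (the path is a placeholder):

```python
import llama_cpp

model_params = llama_cpp.llama_model_default_params()
context_params = llama_cpp.llama_context_default_params()

# Step 1: load the weights using the model-level parameters.
model = llama_cpp.llama_load_model_from_file(
    b"./models/example-model.gguf", model_params  # hypothetical path
)
assert model is not None

# Step 2: create an inference context on top of the loaded model.
ctx = llama_cpp.llama_new_context_with_model(model, context_params)
assert ctx is not None
```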
@@ -353,6 +375,7 @@ class Llama:
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
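The LoRA call now takes a scale argument between the adapter path and the base model, which surfaces at the Python level as the new `lora_scale` keyword. A hypothetical constructor call (all paths are placeholders):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/base-model.gguf",
    lora_path="./loras/adapter.bin",     # adapter to apply
    lora_scale=0.8,                      # new: strength of the adapter
    lora_base="./models/base-f16.gguf",  # optional base model for the adapter
)
```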
@@ -409,7 +432,7 @@ class Llama:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )
 
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
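The `maxlen` argument means only the most recent row of logits is retained unless the model was created with `logits_all`. A stdlib-only illustration of that behaviour (values are made up):

```python
from collections import deque

scores = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]  # one row of logits per evaluated token

n_ctx = 8
logits_all = False
maxlen = n_ctx if logits_all else 1  # keep everything vs. only the newest row

d = deque(scores, maxlen=maxlen)
print(list(d))  # [[0.5, 0.6]] when logits_all is False
```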
@@ -427,7 +450,7 @@ class Llama:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
             self.model,
             text,
             len(text),
@@ -438,7 +461,7 @@ class Llama:
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
             tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize_with_model(
+            n_tokens = llama_cpp.llama_tokenize(
                 self.model,
                 text,
                 len(text),
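The renamed `llama_tokenize` keeps the same "negative return means the buffer was too small" retry convention that the two hunks above implement. A stdlib-only sketch of that pattern with a stand-in tokenizer (the stand-in is hypothetical, not the real binding):

```python
from typing import Callable, List


def tokenize_with_retry(
    tokenize_into: Callable[[List[int]], int], initial_capacity: int
) -> List[int]:
    """Call a llama.cpp-style tokenizer that returns the token count,
    or a negative count when the supplied buffer was too small."""
    buf = [0] * initial_capacity
    n = tokenize_into(buf)
    if n < 0:
        # A negative result encodes the required size; retry with a larger buffer.
        buf = [0] * abs(n)
        n = tokenize_into(buf)
        assert n >= 0
    return buf[:n]


# Hypothetical stand-in for the native tokenizer:
def fake_tokenize(buf: List[int]) -> int:
    tokens = [1, 15043, 2787]  # pretend result
    if len(buf) < len(tokens):
        return -len(tokens)
    buf[: len(tokens)] = tokens
    return len(tokens)


print(tokenize_with_retry(fake_tokenize, initial_capacity=2))  # [1, 15043, 2787]
```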
@@ -466,14 +489,16 @@ class Llama:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
             output += bytes(buffer[:n])
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
-        return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+        return (
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+        )
 
     def set_cache(self, cache: Optional[BaseLlamaCache]):
         """Set the cache.
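Detokenization now goes through `llama_token_to_piece`, writing each token's byte piece into a small C scratch buffer. A minimal sketch of that loop, with the call shape taken from the hunk above; `model` and the token ids are assumed to come from an already-loaded model:

```python
import ctypes
import llama_cpp


def pieces_to_bytes(model, tokens):
    """Concatenate the byte pieces for a list of token ids (sketch)."""
    output = b""
    size = 32  # per-token scratch buffer, same size the wrapper uses
    buffer = (ctypes.c_char * size)()
    for token in tokens:
        n = llama_cpp.llama_token_to_piece(
            model, llama_cpp.llama_token(token), buffer, size
        )
        assert n <= size
        output += bytes(buffer[:n])
    return output
```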
@@ -504,17 +529,16 @@ class Llama:
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             ) # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
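When `logits_all` is off, only the last token's logits from each eval batch are written back into the score buffer; the `rows`/`offset` bookkeeping above can be checked with plain numpy. The shapes and values below are made up for illustration:

```python
import numpy as np

n_vocab = 5      # cols
n_tokens = 4     # tokens in this eval batch
n_past = 3       # tokens already evaluated in the context
logits_all = False

scores = np.zeros((16, n_vocab), dtype=np.single)
batch_logits = np.arange(n_tokens * n_vocab, dtype=np.single).reshape(n_tokens, n_vocab)

rows = n_tokens if logits_all else 1
offset = 0 if logits_all else n_tokens - 1  # only the last token's row is saved
scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[:] = batch_logits[
    -rows:, :
].reshape(-1)

print(np.nonzero(scores.sum(axis=1))[0])  # -> [6], i.e. row n_past + n_tokens - 1
```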
@@ -545,11 +569,7 @@ class Llama:
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = n_vocab if top_k <= 0 else top_k
-        last_n_tokens_size = (
-            n_ctx
-            if last_n_tokens_size < 0
-            else last_n_tokens_size
-        )
+        last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size
         logits: npt.NDArray[np.single] = self._scores[-1, :]
 
         if logits_processor is not None:
@@ -610,7 +630,7 @@ class Llama:
                 mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
                 m=mirostat_m,
             )
-        elif mirostat_mode== 2:
+        elif mirostat_mode == 2:
             mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau)
             llama_cpp.llama_sample_temperature(
                 ctx=self.ctx,
@@ -802,7 +822,7 @@ class Llama:
 
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.
 
         Args:
@@ -814,7 +834,7 @@ class Llama:
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path
 
-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
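As the check above enforces, `create_embedding` only works when the model was constructed with `embedding=True`. A hypothetical usage sketch, assuming the response follows the OpenAI-style embedding shape the wrapper returns (path is a placeholder):

```python
from llama_cpp import Llama

# embedding=True is required for create_embedding().
llm = Llama(model_path="./models/example-model.gguf", embedding=True)

response = llm.create_embedding("llama.cpp bindings")
vector = response["data"][0]["embedding"]  # assumed OpenAI-style response layout
print(len(vector))  # should equal llm.n_embd()
```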
|
@ -900,7 +920,11 @@ class Llama:
|
||||||
created: int = int(time.time())
|
created: int = int(time.time())
|
||||||
completion_tokens: List[int] = []
|
completion_tokens: List[int] = []
|
||||||
# Add blank space to start of prompt to match OG llama tokenizer
|
# Add blank space to start of prompt to match OG llama tokenizer
|
||||||
prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
|
prompt_tokens: List[int] = (
|
||||||
|
self.tokenize(prompt.encode("utf-8"))
|
||||||
|
if prompt != ""
|
||||||
|
else [self.token_bos()]
|
||||||
|
)
|
||||||
text: bytes = b""
|
text: bytes = b""
|
||||||
returned_tokens: int = 0
|
returned_tokens: int = 0
|
||||||
stop = (
|
stop = (
|
||||||
|
@@ -932,7 +956,7 @@ class Llama:
         else:
             stop_sequences = []
 
-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
@@ -1025,7 +1049,9 @@ class Llama:
                 for token in remaining_tokens:
                     token_end_position += len(self.detokenize([token]))
                     # Check if stop sequence is in the token
-                    if token_end_position > (remaining_length - first_stop_position):
+                    if token_end_position > (
+                        remaining_length - first_stop_position
+                    ):
                         break
                     token_str = self.detokenize([token]).decode(
                         "utf-8", errors="ignore"
@@ -1082,7 +1108,7 @@ class Llama:
                 for i in range(1, len(remaining_tokens) + 1):
                     try:
                         bs = self.detokenize(remaining_tokens[:i])
-                        ts = bs.decode('utf-8')
+                        ts = bs.decode("utf-8")
                         decode_success = True
                         break
                     except UnicodeError:
@@ -1093,7 +1119,9 @@ class Llama:
                     # all remaining tokens cannot be decoded to a UTF-8 character
                     break
                 token_end_position += len(bs)
-                if token_end_position > (remaining_length - first_stop_position):
+                if token_end_position > (
+                    remaining_length - first_stop_position
+                ):
                     break
                 remaining_tokens = remaining_tokens[i:]
                 returned_tokens += i
@@ -1398,7 +1426,7 @@ class Llama:
             model=model,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
-            grammar=grammar
+            grammar=grammar,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -1618,47 +1646,68 @@ class Llama:
 
     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
-            n_batch=self.n_batch,
-            n_threads=self.n_threads,
-            lora_base=self.lora_base,
-            lora_path=self.lora_path,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
             tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
+            n_batch=self.n_batch,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
+            lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
+            lora_path=self.lora_path,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )
 
     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
            n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
             last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
            verbose=state["verbose"],
         )
 
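`__getstate__` and `__setstate__` now round-trip the regrouped parameters, so a pickled `Llama` is rebuilt by calling `__init__` with the saved values; this is what `test_llama_pickle` exercises. A sketch, assuming a local model file exists at the placeholder path:

```python
import pickle

from llama_cpp import Llama

# vocab_only=True loads only the vocabulary, which keeps the example cheap.
llm = Llama(model_path="./models/example-model.gguf", vocab_only=True)

data = pickle.dumps(llm)       # captures the dict returned by __getstate__
restored = pickle.loads(data)  # calls __setstate__, which re-runs __init__

assert restored.n_vocab() == llm.n_vocab()
```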
@@ -1711,13 +1760,13 @@ class Llama:
 
     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)
 
     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)
 
     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""
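Both accessors now read from the model handle rather than the context, so the same values can also be obtained from `llm.model` directly. A sketch, assuming a local model file at the placeholder path:

```python
import llama_cpp
from llama_cpp import Llama

llm = Llama(model_path="./models/example-model.gguf", vocab_only=True)

# High-level accessors...
print(llm.n_vocab(), llm.n_embd())

# ...which now delegate to the model handle instead of the context:
assert llm.n_vocab() == llama_cpp.llama_n_vocab(llm.model)
assert llm.n_embd() == llama_cpp.llama_n_embd(llm.model)
```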
File diff suppressed because it is too large
@@ -119,7 +119,7 @@ def test_llama_pickle():
 
 def test_utf8(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
+    n_vocab = llama.n_vocab()
 
     ## Set up mock function
     def mock_eval(*args, **kwargs):
vendor/llama.cpp (vendored submodule)
@@ -1 +1 @@
-Subproject commit a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
+Subproject commit 16bc66d9479edd5ee12ec734973554d4493c5dfa