Update llama.cpp

Andrei Betlen 2023-09-28 22:42:03 -04:00
parent 4177ae6d34
commit 1a1c3dc418
4 changed files with 742 additions and 356 deletions


@@ -30,6 +30,7 @@ import numpy.typing as npt
 from .utils import suppress_stdout_stderr
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
@@ -215,30 +216,37 @@ class Llama:
         self,
         model_path: str,
         *,
-        # NOTE: These parameters are likely to change in the future.
-        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
-        n_ctx: int = 512,
-        n_batch: int = 512,
+        # Model Params
         n_gpu_layers: int = 0,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        low_vram: bool = False,
-        mul_mat_q: bool = True,
-        f16_kv: bool = True,
-        logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
-        embedding: bool = False,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
         n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        mul_mat_q: bool = True,
+        f16_kv: bool = True,
+        logits_all: bool = False,
+        embedding: bool = False,
+        # Sampling Params
         last_n_tokens_size: int = 64,
+        # LoRA Params
         lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Backend Params
         numa: bool = False,
+        # Misc
         verbose: bool = True,
-        **kwargs  # type: ignore
+        # Extra Params
+        **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
@@ -277,52 +285,64 @@ class Llama:
         self.verbose = verbose
+        self.numa = numa
         if not Llama.__backend_initialized:
             if self.verbose:
-                llama_cpp.llama_backend_init(numa)
+                llama_cpp.llama_backend_init(self.numa)
             else:
                 with suppress_stdout_stderr():
-                    llama_cpp.llama_backend_init(numa)
+                    llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True
         self.model_path = model_path
-        self.params = llama_cpp.llama_context_default_params()
-        self.params.seed = seed
-        self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.main_gpu = main_gpu
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-        self.params.low_vram = low_vram
-        self.params.mul_mat_q = mul_mat_q
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mmap = use_mmap if lora_path is None else False
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
+            0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
+        )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._p_tensor_split = None
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(
-                *tensor_split
+                *tensor_split  # type: ignore
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._c_tensor_split
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or max(
+            multiprocessing.cpu_count() // 2, 1
+        )
+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.seed = seed
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_freq_base = rope_freq_base
+        self.context_params.rope_freq_scale = rope_freq_scale
+        self.context_params.mul_mat_q = mul_mat_q
+        self.context_params.f16_kv = f16_kv
+        self.context_params.logits_all = logits_all
+        self.context_params.embedding = embedding
+        # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
-        self.n_batch = min(n_ctx, n_batch)
         self.cache: Optional[BaseLlamaCache] = None
-        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
         self.lora_base = lora_base
+        self.lora_scale = lora_scale
         self.lora_path = lora_path
         if not os.path.exists(model_path):
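
The tensor_split handling above converts the Python list into a fixed-size ctypes float array and keeps a reference so it is not garbage collected. A standalone sketch of that pattern (the LLAMA_MAX_DEVICES value here is assumed for illustration; the real constant comes from llama_cpp):

    import ctypes

    LLAMA_MAX_DEVICES = 16  # assumed value for illustration
    FloatArray = ctypes.c_float * LLAMA_MAX_DEVICES
    tensor_split = [0.75, 0.25]  # per-device fractions
    c_tensor_split = FloatArray(*tensor_split)  # unused slots stay 0.0
    assert list(c_tensor_split)[:2] == [0.75, 0.25]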
@@ -330,21 +350,23 @@ class Llama:
         if verbose:
             self.model = llama_cpp.llama_load_model_from_file(
-                self.model_path.encode("utf-8"), self.params
+                self.model_path.encode("utf-8"), self.model_params
             )
         else:
             with suppress_stdout_stderr():
                 self.model = llama_cpp.llama_load_model_from_file(
-                    self.model_path.encode("utf-8"), self.params
+                    self.model_path.encode("utf-8"), self.model_params
                 )
         assert self.model is not None
         if verbose:
-            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model, self.context_params
+            )
         else:
             with suppress_stdout_stderr():
                 self.ctx = llama_cpp.llama_new_context_with_model(
-                    self.model, self.params
+                    self.model, self.context_params
                 )
         assert self.ctx is not None
@@ -353,6 +375,7 @@ class Llama:
             if llama_cpp.llama_model_apply_lora_from_file(
                 self.model,
                 self.lora_path.encode("utf-8"),
+                self.lora_scale,
                 self.lora_base.encode("utf-8")
                 if self.lora_base is not None
                 else llama_cpp.c_char_p(0),
@@ -409,7 +432,7 @@ class Llama:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.params.logits_all else 1,
+            maxlen=self._n_ctx if self.model_params.logits_all else 1,
         )
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
@@ -427,7 +450,7 @@ class Llama:
         assert self.model is not None
         n_ctx = self._n_ctx
         tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize_with_model(
+        n_tokens = llama_cpp.llama_tokenize(
             self.model,
             text,
             len(text),
@@ -438,7 +461,7 @@ class Llama:
         if n_tokens < 0:
             n_tokens = abs(n_tokens)
             tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize_with_model(
+            n_tokens = llama_cpp.llama_tokenize(
                 self.model,
                 text,
                 len(text),
@@ -466,14 +489,16 @@ class Llama:
         size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_piece_with_model(
+            n = llama_cpp.llama_token_to_piece(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
             output += bytes(buffer[:n])
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
-        return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+        return (
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+        )
     def set_cache(self, cache: Optional[BaseLlamaCache]):
         """Set the cache.
@@ -504,17 +529,16 @@ class Llama:
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
                 n_tokens=n_tokens,
                 n_past=n_past,
-                n_threads=self.n_threads,
             )
             if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.params.logits_all else 1
+            rows = n_tokens if self.context_params.logits_all else 1
             cols = self._n_vocab
             offset = (
-                0 if self.params.logits_all else n_tokens - 1
+                0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
             self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
                 -1
@@ -545,11 +569,7 @@ class Llama:
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = n_vocab if top_k <= 0 else top_k
-        last_n_tokens_size = (
-            n_ctx
-            if last_n_tokens_size < 0
-            else last_n_tokens_size
-        )
+        last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size
         logits: npt.NDArray[np.single] = self._scores[-1, :]
         if logits_processor is not None:
@@ -610,7 +630,7 @@ class Llama:
                 mu=llama_cpp.ctypes.byref(mirostat_mu),  # type: ignore
                 m=mirostat_m,
             )
-        elif mirostat_mode== 2:
+        elif mirostat_mode == 2:
             mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau)
             llama_cpp.llama_sample_temperature(
                 ctx=self.ctx,
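
Mirostat sampling is driven from the high-level completion API via the sampling keyword arguments; a hedged usage sketch (placeholder path, values illustrative):

    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf", verbose=False)
    out = llm(
        "Q: What is the capital of France? A:",
        max_tokens=16,
        mirostat_mode=2,  # 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
        mirostat_tau=5.0,
        mirostat_eta=0.1,
    )
    print(out["choices"][0]["text"])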
@@ -802,7 +822,7 @@ class Llama:
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None
-    ) -> Embedding:
+    ) -> CreateEmbeddingResponse:
         """Embed a string.
         Args:
@@ -814,7 +834,7 @@ class Llama:
         assert self.ctx is not None
         model_name: str = model if model is not None else self.model_path
-        if self.params.embedding == False:
+        if self.model_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
             )
@@ -900,7 +920,11 @@ class Llama:
         created: int = int(time.time())
         completion_tokens: List[int] = []
         # Add blank space to start of prompt to match OG llama tokenizer
-        prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
+        prompt_tokens: List[int] = (
+            self.tokenize(prompt.encode("utf-8"))
+            if prompt != ""
+            else [self.token_bos()]
+        )
         text: bytes = b""
         returned_tokens: int = 0
         stop = (
@@ -932,7 +956,7 @@ class Llama:
         else:
             stop_sequences = []
-        if logprobs is not None and self.params.logits_all is False:
+        if logprobs is not None and self.model_params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
@@ -1025,7 +1049,9 @@ class Llama:
                 for token in remaining_tokens:
                     token_end_position += len(self.detokenize([token]))
                     # Check if stop sequence is in the token
-                    if token_end_position > (remaining_length - first_stop_position):
+                    if token_end_position > (
+                        remaining_length - first_stop_position
+                    ):
                         break
                     token_str = self.detokenize([token]).decode(
                         "utf-8", errors="ignore"
@@ -1082,7 +1108,7 @@ class Llama:
                     for i in range(1, len(remaining_tokens) + 1):
                         try:
                             bs = self.detokenize(remaining_tokens[:i])
-                            ts = bs.decode('utf-8')
+                            ts = bs.decode("utf-8")
                             decode_success = True
                             break
                         except UnicodeError:
@@ -1093,7 +1119,9 @@ class Llama:
                        # all remaining tokens cannot be decoded to a UTF-8 character
                        break
                    token_end_position += len(bs)
-                    if token_end_position > (remaining_length - first_stop_position):
+                    if token_end_position > (
+                        remaining_length - first_stop_position
+                    ):
                        break
                    remaining_tokens = remaining_tokens[i:]
                    returned_tokens += i
@@ -1398,7 +1426,7 @@ class Llama:
             model=model,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
-            grammar=grammar
+            grammar=grammar,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -1618,47 +1646,68 @@ class Llama:
     def __getstate__(self):
         return dict(
-            verbose=self.verbose,
             model_path=self.model_path,
-            n_ctx=self.params.n_ctx,
-            n_gpu_layers=self.params.n_gpu_layers,
-            seed=self.params.seed,
-            f16_kv=self.params.f16_kv,
-            logits_all=self.params.logits_all,
-            vocab_only=self.params.vocab_only,
-            use_mmap=self.params.use_mmap,
-            use_mlock=self.params.use_mlock,
-            embedding=self.params.embedding,
-            low_vram=self.params.low_vram,
-            last_n_tokens_size=self.last_n_tokens_size,
-            n_batch=self.n_batch,
-            n_threads=self.n_threads,
-            lora_base=self.lora_base,
-            lora_path=self.lora_path,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            main_gpu=self.model_params.main_gpu,
             tensor_split=self.tensor_split,
-            mul_mat_q=self.params.mul_mat_q,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            # Context Params
+            seed=self.context_params.seed,
+            n_ctx=self.context_params.n_ctx,
+            n_batch=self.n_batch,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            mul_mat_q=self.context_params.mul_mat_q,
+            f16_kv=self.context_params.f16_kv,
+            logits_all=self.context_params.logits_all,
+            embedding=self.context_params.embedding,
+            # Sampling Params
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
+            lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
+            lora_path=self.lora_path,
+            # Backend Params
+            numa=self.numa,
+            # Misc
+            verbose=self.verbose,
         )
     def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
-            n_ctx=state["n_ctx"],
+            # Model Params
             n_gpu_layers=state["n_gpu_layers"],
-            seed=state["seed"],
-            f16_kv=state["f16_kv"],
-            logits_all=state["logits_all"],
+            main_gpu=state["main_gpu"],
+            tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
-            embedding=state["embedding"],
-            low_vram=state["low_vram"],
-            n_threads=state["n_threads"],
+            # Context Params
+            seed=state["seed"],
+            n_ctx=state["n_ctx"],
             n_batch=state["n_batch"],
+            n_threads=state["n_threads"],
+            n_threads_batch=state["n_threads_batch"],
+            rope_freq_base=state["rope_freq_base"],
+            rope_freq_scale=state["rope_freq_scale"],
+            mul_mat_q=state["mul_mat_q"],
+            f16_kv=state["f16_kv"],
+            logits_all=state["logits_all"],
+            embedding=state["embedding"],
+            # Sampling Params
             last_n_tokens_size=state["last_n_tokens_size"],
+            # LoRA Params
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
-            tensor_split=state["tensor_split"],
-            mul_mat_q=state["mul_mat_q"],
+            # Backend Params
+            numa=state["numa"],
+            # Misc
             verbose=state["verbose"],
         )
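
Because __getstate__/__setstate__ simply round-trip the constructor arguments, an instance can be pickled and restored, with the model file reloaded on restore; a sketch mirroring the repository's pickle test (placeholder path):

    import pickle
    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf", vocab_only=True, verbose=False)
    restored = pickle.loads(pickle.dumps(llm))  # re-runs __init__ with the saved kwargs
    assert restored.model_path == llm.model_path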
@@ -1711,13 +1760,13 @@ class Llama:
     def n_embd(self) -> int:
         """Return the embedding size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_embd(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)
     def n_vocab(self) -> int:
         """Return the vocabulary size."""
-        assert self.ctx is not None
-        return llama_cpp.llama_n_vocab(self.ctx)
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)
     def tokenizer(self) -> "LlamaTokenizer":
         """Return the tokenizer for this model."""

File diff suppressed because it is too large.


@@ -119,7 +119,7 @@ def test_llama_pickle():
 def test_utf8(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
+    n_vocab = llama.n_vocab()
     ## Set up mock function
     def mock_eval(*args, **kwargs):

vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
+Subproject commit 16bc66d9479edd5ee12ec734973554d4493c5dfa