Compare commits

13 commits: fc2af04c15...c2e4d5820a

Commits (SHA1):

- c2e4d5820a
- 83d6b26e6f
- 255e1b4495
- d634efcdd9
- 6e0642ca19
- 027f7bc678
- 951e39caf9
- c3ef41ba06
- ae5682f500
- cd3f1bb387
- 6b018e00b1
- a6457ba74b
- af3ed503e9

13 changed files with 123 additions and 141 deletions
CHANGELOG.md

@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.77]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@bde7cd3cd949c1a85d3a199498ac98e78039d46f
+- fix: string value kv_overrides by @abetlen in df45a4b3fe46e72664bda87301b318210c6d4782
+- fix: Fix typo in Llama3VisionAlphaChatHandler by @abetlen in 165b4dc6c188f8fda2fc616154e111f710484eba
+- fix: Use numpy recarray for candidates data, fixes bug with temp < 0 by @abetlen in af3ed503e9ce60fe6b5365031abad4176a3536b3
+- fix: Disable Windows+CUDA workaround when compiling for HIPBLAS by Engininja2 in #1493
+
 ## [0.2.76]

 - feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
CMakeLists.txt

@@ -41,14 +41,16 @@ if (LLAMA_BUILD)
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-    )
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-    )
+    if (WIN32 AND (LLAMA_CUDA OR LLAMA_CUBLAS))
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        )
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        )
+    endif()

 if (LLAVA_BUILD)
     if (LLAMA_CUBLAS OR LLAMA_CUDA)
Makefile (3 changed lines)

@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .

+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist
README.md (11 changed lines)

@@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>

+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+source /opt/intel/oneapi/setvars.sh
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+
 ### Windows Notes
llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.76"
+__version__ = "0.2.77"
llama_cpp/_internals.py

@@ -128,9 +128,9 @@ class _LlamaModel:
         assert self.model is not None
         return llama_cpp.llama_token_get_score(self.model, token)

-    def token_get_type(self, token: int) -> int:
+    def token_get_attr(self, token: int) -> int:
         assert self.model is not None
-        return llama_cpp.llama_token_get_type(self.model, token)
+        return llama_cpp.llama_token_get_attr(self.model, token)

     # Special tokens

@@ -142,6 +142,14 @@ class _LlamaModel:
         assert self.model is not None
         return llama_cpp.llama_token_eos(self.model)

+    def token_cls(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_cls(self.model)
+
+    def token_sep(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_sep(self.model)
+
     def token_nl(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_nl(self.model)
@@ -545,13 +553,12 @@ class _LlamaBatch:
 class _LlamaTokenDataArray:
     def __init__(self, *, n_vocab: int):
         self.n_vocab = n_vocab
-        self.candidates_data = np.array(
-            [],
+        self.candidates_data = np.recarray(
+            (self.n_vocab,),
             dtype=np.dtype(
                 [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
             ),
         )
-        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
         self.candidates = llama_cpp.llama_token_data_array(
             data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=self.n_vocab,

@@ -561,14 +568,11 @@ class _LlamaTokenDataArray:
         self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)

     def copy_logits(self, logits: npt.NDArray[np.single]):
-        self.candidates_data["id"][:] = self.default_candidates_data_id
-        self.candidates_data["logit"][:] = logits
-        self.candidates_data["p"][:] = self.default_candidates_data_p
-        self.candidates.data = self.candidates_data.ctypes.data_as(
-            llama_cpp.llama_token_data_p
-        )
-        self.candidates.sorted = ctypes.c_bool(False)
-        self.candidates.size = ctypes.c_size_t(self.n_vocab)
+        self.candidates_data.id[:] = self.default_candidates_data_id
+        self.candidates_data.logit[:] = logits
+        self.candidates_data.p[:] = self.default_candidates_data_p
+        self.candidates.sorted = False
+        self.candidates.size = self.n_vocab


 # Python wrappers over common/common

@@ -759,14 +763,14 @@ class _LlamaSamplingContext:
            self.params.penalty_present,
         )
         if not self.params.penalize_nl:
-            token_data_array.candidates_data["logit"][nl_token] = nl_logit
+            token_data_array.candidates_data.logit[nl_token] = nl_logit

         if self.grammar is not None:
             ctx_main.sample_grammar(token_data_array, self.grammar)

         if self.params.temp < 0:
             ctx_main.sample_softmax(token_data_array)
-            id = token_data_array.candidates_data["id"][0]
+            id = token_data_array.candidates_data.id[0]
         elif self.params.temp == 0:
             id = ctx_main.sample_token_greedy(token_data_array)
         else:
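Note on the `_LlamaTokenDataArray` change above: allocating a `numpy.recarray` of shape `(n_vocab,)` with the aligned `(id, logit, p)` dtype gives one record per candidate token (the layout llama.cpp expects for `llama_token_data`) and enables the attribute-style field access now used in `copy_logits`. A minimal standalone sketch using only numpy; the vocabulary size below is a made-up placeholder:

```python
import numpy as np

n_vocab = 8  # placeholder; the real value comes from the loaded model
candidates_data = np.recarray(
    (n_vocab,),
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)

# Attribute access replaces the old string indexing (candidates_data["id"][:] = ...)
candidates_data.id[:] = np.arange(n_vocab, dtype=np.intc)
candidates_data.logit[:] = np.zeros(n_vocab, dtype=np.single)
candidates_data.p[:] = 0.0

# Each element is a single (id, logit, p) record, so the buffer is one contiguous
# array of per-token structs that a ctypes pointer can view directly.
print(candidates_data[0], candidates_data.ctypes.data)
```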
llama_cpp/llama.py

@@ -8,6 +8,7 @@ import json
 import ctypes
 import typing
 import fnmatch
+import warnings
 import multiprocessing

 from typing import (

@@ -71,6 +72,7 @@ class Llama:
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,

@@ -149,6 +151,7 @@ class Llama:
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.

@@ -220,6 +223,11 @@ class Llama:
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        if rpc_servers is not None:
+            self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+            self._rpc_servers = rpc_servers
+        else:
+            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
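The new `rpc_servers` argument is passed straight through to `llama_model_params.rpc_servers` as a UTF-8 string. A hedged usage sketch; the model path and host:port addresses are placeholders, and a llama.cpp build with RPC enabled (e.g. via the `build.rpc` Makefile target above) is assumed:

```python
from llama_cpp import Llama

# Hypothetical example: offload work to remote rpc-server instances.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    n_gpu_layers=-1,
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma separated host:port list (placeholders)
)
```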
@@ -1019,6 +1027,12 @@ class Llama:
         )
         model_name: str = model if model is not None else self.model_path

+        if prompt_tokens[:2] == [self.token_bos()] * 2:
+            warnings.warn(
+                f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...',
+                RuntimeWarning,
+            )
+
         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
         if logit_bias is not None:

@@ -1403,8 +1417,8 @@ class Llama:
             top_logprobs: List[Optional[Dict[str, float]]] = []

             if echo:
-                # Remove leading BOS token
-                all_tokens = prompt_tokens[1:] + completion_tokens
+                # Remove leading BOS token if exists
+                all_tokens = prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0:] + completion_tokens
             else:
                 all_tokens = completion_tokens
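Both changes above address the same failure mode: a prompt that already spells out the BOS token gets a second BOS prepended during tokenization. A small sketch of the condition the new `RuntimeWarning` detects; the model path is a placeholder and `_model` is an internal helper used here only for illustration:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path
bos_text = llm._model.token_get_text(llm.token_bos())  # e.g. "<s>" or "<|begin_of_text|>"

# Tokenizing text that already starts with the BOS string while add_bos=True
# yields two leading BOS ids, which is exactly the pattern the warning reports.
tokens = llm.tokenize((bos_text + " Hello").encode("utf-8"), add_bos=True, special=True)
print(tokens[:2] == [llm.token_bos()] * 2)
```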
llama_cpp/llama_chat_format.py

@@ -160,6 +160,7 @@ class ChatFormatterResponse:
     prompt: str
     stop: Optional[Union[str, List[str]]] = None
     stopping_criteria: Optional[llama.StoppingCriteriaList] = None
+    added_special: bool = False


 class ChatFormatter(Protocol):

@@ -232,7 +233,7 @@ class Jinja2ChatFormatter(ChatFormatter):
             return tokens[-1] in self.stop_token_ids
         stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token])

-        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria)
+        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria, added_special=True)

     def to_chat_handler(self) -> LlamaChatCompletionHandler:
         return chat_formatter_to_chat_completion_handler(self)

@@ -548,7 +549,7 @@ def chat_formatter_to_chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = result.prompt
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]

@@ -655,7 +656,7 @@ def hf_autotokenizer_to_chat_formatter(
         prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
-        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token, added_special=True)

     return format_autotokenizer

@@ -708,7 +709,7 @@ def hf_tokenizer_config_to_chat_formatter(
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token], added_special=True)

     return format_tokenizer_config
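Taken together, the `added_special` flag and the handler-side `llama.tokenize(..., add_bos=not result.added_special, special=True)` call let each formatter declare whether it already emitted special tokens, so BOS is added exactly once. A hedged sketch of that contract, using `format_llama3` from this module with the defaults shown in this diff:

```python
from llama_cpp import llama_chat_format

messages = [{"role": "user", "content": "Hi"}]
result = llama_chat_format.format_llama3(messages=messages)

# format_llama3 no longer bakes "<|begin_of_text|>" into the prompt, so it leaves
# added_special at its default of False and the chat handler tokenizes with add_bos=True:
#     llama.tokenize(result.prompt.encode("utf-8"),
#                    add_bos=not result.added_special, special=True)
print(result.added_special, result.prompt[:40])
```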
@@ -918,7 +919,7 @@ def format_llama2(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    _system_template = "<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>"
+    _system_template = "[INST] <<SYS>>\n{system_message}\n<</SYS>>"
     _roles = dict(user="<s>[INST]", assistant="[/INST]")
     _messages = _map_roles(messages, _roles)
     system_message = _get_system_message(messages)

@@ -940,11 +941,10 @@ def format_llama3(
         user="<|start_header_id|>user<|end_header_id|>\n\n",
         assistant="<|start_header_id|>assistant<|end_header_id|>\n\n",
     )
-    _begin_token = "<|begin_of_text|>"
     _sep = "<|eot_id|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(_begin_token, _messages, _sep)
+    _prompt = _format_no_colon_single("", _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)


@@ -1229,10 +1229,9 @@ def format_mistral_instruct(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    bos = "<s>"
     eos = "</s>"
     stop = eos
-    prompt = bos
+    prompt = ""
     for message in messages:
         if (
             message["role"] == "user"
@@ -2642,13 +2641,13 @@ class Llava15ChatHandler:
                 if type_ == "text":
                     tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                     if llama.n_tokens + len(tokens) > llama.n_ctx():
-                        raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
+                        raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}")
                     llama.eval(tokens)
                 else:
                     image_bytes = self.load_image(value)
                     embed = embed_image_bytes(image_bytes)
                     if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
-                        raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
+                        raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}")
                     n_past = ctypes.c_int(llama.n_tokens)
                     n_past_p = ctypes.pointer(n_past)
                     with suppress_stdout_stderr(disable=self.verbose):
llama_cpp/llama_cpp.py

@@ -333,7 +333,7 @@ LLAMA_ROPE_TYPE_NEOX = 2
 LLAMA_ROPE_TYPE_GLM = 4


-# enum llama_token_type {
+# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
 #     LLAMA_TOKEN_TYPE_UNDEFINED = 0,
 #     LLAMA_TOKEN_TYPE_NORMAL = 1,
 #     LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -351,6 +351,32 @@ LLAMA_TOKEN_TYPE_UNUSED = 5
 LLAMA_TOKEN_TYPE_BYTE = 6


+# enum llama_token_attr {
+#     LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+#     LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
+#     LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
+#     LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
+#     LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
+#     LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+#     LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
+#     LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
+#     LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
+#     LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
+#     LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
+# };
+LLAMA_TOKEN_ATTR_UNDEFINED = 0
+LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
+LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
+LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
+LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
+LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
+LLAMA_TOKEN_ATTR_BYTE = 1 << 5
+LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
+LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
+LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
+LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
+
+
 # // model file types
 # enum llama_ftype {
 #     LLAMA_FTYPE_ALL_F32 = 0,
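Unlike the old `LLAMA_TOKEN_TYPE_*` enumeration, the new `LLAMA_TOKEN_ATTR_*` values are bitflags, so a token may carry several attributes at once and callers test them with bitwise AND. A trivial sketch (the combined value below is hypothetical):

```python
from llama_cpp import llama_cpp

# A hypothetical attribute value combining two flags.
attr = llama_cpp.LLAMA_TOKEN_ATTR_CONTROL | llama_cpp.LLAMA_TOKEN_ATTR_RSTRIP

assert attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL      # set
assert attr & llama_cpp.LLAMA_TOKEN_ATTR_RSTRIP       # set
assert not (attr & llama_cpp.LLAMA_TOKEN_ATTR_BYTE)   # not set
```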
@@ -959,6 +985,9 @@ llama_grammar_p = ctypes.c_void_p
 # // modifies a preceding LLAMA_GRETYPE_CHAR or
 # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 # LLAMA_GRETYPE_CHAR_ALT = 6,
+
+# // any character (.)
+# LLAMA_GRETYPE_CHAR_ANY = 7,
 # };
 LLAMA_GRETYPE_END = 0
 LLAMA_GRETYPE_ALT = 1

@@ -967,6 +996,7 @@ LLAMA_GRETYPE_CHAR = 3
 LLAMA_GRETYPE_CHAR_NOT = 4
 LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
 LLAMA_GRETYPE_CHAR_ALT = 6
+LLAMA_GRETYPE_CHAR_ANY = 7


 # typedef struct llama_grammar_element {
@@ -1233,12 +1263,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 def llama_pooling_type(ctx: llama_context_p, /) -> int: ...


 # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
 @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
 def llama_vocab_type(model: llama_model_p, /) -> int: ...


 # LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
 @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
 def llama_rope_type(model: llama_model_p, /) -> int: ...
@@ -2438,11 +2468,11 @@ def llama_token_get_score(
 ) -> float: ...


-# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
 @ctypes_function(
-    "llama_token_get_type", [llama_model_p_ctypes, llama_token], ctypes.c_int
+    "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int
 )
-def llama_token_get_type(
+def llama_token_get_attr(
     model: llama_model_p, token: Union[llama_token, int], /
 ) -> int: ...
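A hedged sketch of calling the renamed binding directly against a loaded model; the model path is a placeholder and `llm._model.model` is an internal attribute, shown only to obtain the raw `llama_model` pointer for illustration:

```python
from llama_cpp import Llama, llama_cpp

llm = Llama(model_path="./models/model.gguf")  # placeholder path
model = llm._model.model                       # raw llama_model pointer (internal)

bos = llama_cpp.llama_token_bos(model)
attr = llama_cpp.llama_token_get_attr(model, bos)
if attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL:
    print(f"BOS token {bos} is flagged as a control token")
```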
@@ -3200,104 +3230,9 @@ def llama_grammar_accept_token(


 # //
-# // Beam search
+# // Model split
 # //

-# struct llama_beam_view {
-#     const llama_token * tokens;
-
-
-#     size_t n_tokens;
-#     float p; // Cumulative beam probability (renormalized relative to all beams)
-#     bool eob; // Callback should set this to true when a beam is at end-of-beam.
-# };
-class llama_beam_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        tokens: CtypesArray[llama_token]
-        n_tokens: int
-        p: float
-        eob: bool
-
-    _fields_ = [
-        ("tokens", llama_token_p),
-        ("n_tokens", ctypes.c_size_t),
-        ("p", ctypes.c_float),
-        ("eob", ctypes.c_bool),
-    ]
-
-
-# // Passed to beam_search_callback function.
-# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-# // These pointers are valid only during the synchronous callback, so should not be saved.
-# struct llama_beams_state {
-#     struct llama_beam_view * beam_views;
-#     size_t n_beams; // Number of elements in beam_views[].
-#     size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
-#     bool last_call; // True iff this is the last callback invocation.
-# };
-class llama_beams_state(ctypes.Structure):
-    if TYPE_CHECKING:
-        beam_views: CtypesArray[llama_beam_view]
-        n_beams: int
-        common_prefix_length: int
-        last_call: bool
-
-    _fields_ = [
-        ("beam_views", ctypes.POINTER(llama_beam_view)),
-        ("n_beams", ctypes.c_size_t),
-        ("common_prefix_length", ctypes.c_size_t),
-        ("last_call", ctypes.c_bool),
-    ]
-
-
-# // Type of pointer to the beam_search_callback function.
-# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(
-    None, ctypes.c_void_p, llama_beams_state
-)
-
-
-# /// @details Deterministically returns entire sentence constructed by a beam search.
-# /// @param ctx Pointer to the llama_context.
-# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-# /// @param callback_data A pointer that is simply passed back to callback.
-# /// @param n_beams Number of beams to use.
-# /// @param n_past Number of tokens already evaluated.
-# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-# /// @param n_threads Number of threads as passed to llama_eval().
-# LLAMA_API void llama_beam_search(
-#        struct llama_context * ctx,
-#        llama_beam_search_callback_fn_t callback,
-#        void * callback_data,
-#        size_t n_beams,
-#        int32_t n_past,
-#        int32_t n_predict);
-@ctypes_function(
-    "llama_beam_search",
-    [
-        llama_context_p_ctypes,
-        llama_beam_search_callback_fn_t,
-        ctypes.c_void_p,
-        ctypes.c_size_t,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    None,
-)
-def llama_beam_search(
-    ctx: llama_context_p,
-    callback: CtypesFuncPointer,
-    callback_data: ctypes.c_void_p,
-    n_beams: Union[ctypes.c_size_t, int],
-    n_past: Union[ctypes.c_int, int],
-    n_predict: Union[ctypes.c_int, int],
-    /,
-): ...
-
-
 # /// @details Build a split GGUF final path for this chunk.
 # /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
 # // Returns the split_path length.
llama_cpp/server/model.py

@@ -226,6 +226,7 @@ class LlamaProxy:
             use_mmap=settings.use_mmap,
             use_mlock=settings.use_mlock,
             kv_overrides=kv_overrides,
+            rpc_servers=settings.rpc_servers,
             # Context Params
             seed=settings.seed,
             n_ctx=settings.n_ctx,
llama_cpp/server/settings.py

@@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
     )
+    rpc_servers: Optional[str] = Field(
+        default=None,
+        description="comma seperated list of rpc servers for offloading",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
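Since `ModelSettings` is a pydantic settings class, the new field can be supplied programmatically, from a JSON config file, or from the environment like the other model settings, and `LlamaProxy` forwards it to the `Llama` constructor as shown above. A small sketch; path and addresses are placeholders:

```python
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/model.gguf",                          # placeholder path
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # placeholder addresses
)
print(settings.rpc_servers)
```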
tests/test_llama_chat_format.py

@@ -21,12 +21,13 @@ def test_mistral_instruct():
     response = llama_chat_format.format_mistral_instruct(
         messages=messages,
     )
+    prompt = ("" if response.added_special else "<s>") + response.prompt
     reference = chat_formatter.render(
         messages=messages,
         bos_token="<s>",
         eos_token="</s>",
     )
-    assert response.prompt == reference
+    assert prompt == reference


 mistral_7b_tokenizer_config = """{
vendor/llama.cpp (vendored submodule, 2 changed lines)

@@ -1 +1 @@
-Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d
+Subproject commit 5795b941827fdec6c1662986de962badff456718