Compare commits: 1d177aaaef...5b4ad6f4d1

48 commits:

5b4ad6f4d1, 3dbfec74e7, d8a3b013c3, 03f171e810, b564d05806, d99a6ba607,
e811a81066, ca8e3c967d, 5212fb08ae, 389e09c2f5, 4b54f79330, 50f5c74ecf,
43ba1526c8, 3f8e17af63, 3c19faa0d4, 3fe8e9a8f3, 9dc5e20fb6, 1547202b77,
7f59856fa6, 73165021bb, eafb6ec5e8, ac55d0a175, 4badac3a60, 561e880654,
b454f40a9a, 5ab40e6167, bf66a283e8, 3757328b70, 77122638b4, 2a39b99575,
9ce5cb376a, 4a7122d22f, 228949c1f7, 903b28adf5, 07966b9ba7, a50d24e3a7,
0318702cdc, 3666833107, 3e2597eac8, e0d7674e62, 1f56c648c3, f9b7221c8f,
9f7a85571a, 0a454bebe6, 2138561fab, 2117122396, d75dea18db, 31b1d95a6c
20 changed files with 807 additions and 193 deletions
.github/dependabot.yml (vendored, 8 changes)

@@ -8,8 +8,12 @@ updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "daily"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "weekly"
+      interval: "daily"
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"
.github/workflows/build-and-release.yaml (vendored, 4 changes)

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.0
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""

@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.0
        env:
          CIBW_SKIP: "*musllinux* pp*"
          CIBW_REPAIR_WHEEL_COMMAND: ""
CHANGELOG.md (54 changes)

@@ -7,9 +7,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.75]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
+- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
+- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
+- misc: Remove unnecessary metadata lookups by @CISC in #1448
+
+## [0.2.74]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
+- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
+- docs: Fix typo in README.md by @yupbank in #1444
+
+## [0.2.73]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
+- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
+
+## [0.2.72]
+
+- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
+- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
+
+## [0.2.71]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
+- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
+- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
+
+## [0.2.70]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
+- feat: fill-in-middle support by @CISC in #1386
+- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
+- docs: update README.md @eltociear in #1432
+- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
+- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
+- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
+- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
+- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
+- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
+- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
+- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
+
+## [0.2.69]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
+- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
+- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
+- fix: UTF-8 handling with grammars by @jsoma in #1415
+
 ## [0.2.68]

-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
 - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
 - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
CMakeLists.txt

@@ -51,8 +51,9 @@ if (LLAMA_BUILD)
     )

     if (LLAVA_BUILD)
-        if (LLAMA_CUBLAS)
+        if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
+            add_compile_definitions(GGML_USE_CUDA)
         endif()

         if (LLAMA_METAL)
Makefile (2 changes)

@@ -16,7 +16,7 @@ build.debug:
 	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .

 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
+	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

 build.opencl:
 	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
README.md

@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
 llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 llm.create_chat_completion(
     messages = [

@@ -547,10 +547,10 @@ llm = Llama.from_pretrained(
   repo_id="vikhyatk/moondream2",
   filename="*text-model*",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )

-respoonse = llm.create_chat_completion(
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",
examples/ray/README.md (new file, 19 lines)

@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```
examples/ray/llm.py (new executable file, 20 lines)

@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])
examples/ray/requirements.txt (new file, 3 lines)

@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python
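For reference, a sketch of calling the deployed endpoint from Python rather than curl. The URL and JSON fields follow the example README above; the `requests` package is an assumption and is not part of the example's requirements.txt.

```python
# Hypothetical Python client for the Ray Serve deployment above.
# Assumes the endpoint from the README (http://localhost:8000) and that
# `requests` is installed separately.
import requests

resp = requests.post(
    "http://localhost:8000",
    json={"prompt": "tell me a joke", "max_tokens": 128},
    timeout=120,
)
resp.raise_for_status()
completion = resp.json()
# The deployment returns the raw llama-cpp-python completion dict.
print(completion["choices"][0]["text"])
```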
llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.68"
+__version__ = "0.2.75"
llama_cpp/_internals.py

@@ -203,7 +203,7 @@ class _LlamaModel:
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
         return (
-            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
         )

     # Extra
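For context, a quick sketch of the behaviour this change guards: the leading space injected after BOS is now only stripped when it is actually present. The model path below is a placeholder and the exact round trip depends on the tokenizer.

```python
# Hypothetical illustration of detokenization around the BOS token.
# "model.gguf" is a placeholder; behaviour varies by tokenizer type.
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", vocab_only=True, verbose=False)
tokens = llm.tokenize(b"Hello world", add_bos=True)
text = llm.detokenize(tokens)
# With the fix, SPM-style tokenizers round-trip to b"Hello world" while
# BPE-style tokenizers (no injected space) are left untouched.
print(text)
```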
llama_cpp/llama.py

@@ -262,7 +262,12 @@ class Llama:
                     raise ValueError(f"Value for {k} is too long: {v}")
                 v_bytes = v_bytes.ljust(128, b"\0")
                 self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
-                self._kv_overrides_array[i].value.str_value[:128] = v_bytes
+                # copy min(v_bytes, 128) to str_value
+                ctypes.memmove(
+                    self._kv_overrides_array[i].value.str_value,
+                    v_bytes,
+                    min(len(v_bytes), 128),
+                )
             else:
                 raise ValueError(f"Unknown value type for {k}: {v}")
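A minimal standalone sketch of the bounded copy used above, with a plain ctypes buffer standing in for the `str_value` field of the kv-override struct (the buffer and value here are illustrative, not from the repository):

```python
# Sketch of copying a short bytes value into a fixed-size ctypes field.
import ctypes

str_value = (ctypes.c_char * 128)()   # stand-in for value.str_value
v_bytes = b"llama-3"                  # example override value

# ctypes.memmove copies exactly min(len(v_bytes), 128) bytes into the
# fixed-size field, mirroring the change in the diff above.
ctypes.memmove(str_value, v_bytes, min(len(v_bytes), 128))
print(bytes(str_value[: len(v_bytes)]))  # b'llama-3'
```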
@@ -373,6 +378,7 @@ class Llama:

         self.chat_format = chat_format
         self.chat_handler = chat_handler
+        self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}

         self.draft_model = draft_model

@@ -404,10 +410,33 @@ class Llama:
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)

+        eos_token_id = self.token_eos()
+        bos_token_id = self.token_bos()
+
+        eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
+        bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
+
+        # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
+        template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
+
+        if "tokenizer.chat_template" in self.metadata:
+            template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
+
+        if self.verbose and template_choices:
+            print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
+
+        for name, template in template_choices.items():
+            self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
+                template=template,
+                eos_token=eos_token,
+                bos_token=bos_token,
+                stop_token_ids=[eos_token_id],
+            ).to_chat_handler()
+
         if (
             self.chat_format is None
             and self.chat_handler is None
-            and "tokenizer.chat_template" in self.metadata
+            and "chat_template.default" in template_choices
         ):
             chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
                 self.metadata
@@ -418,35 +447,17 @@ class Llama:
                 if self.verbose:
                     print(f"Guessed chat format: {chat_format}", file=sys.stderr)
             else:
-                template = self.metadata["tokenizer.chat_template"]
-                try:
-                    eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
-                except:
-                    eos_token_id = self.token_eos()
-                try:
-                    bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
-                except:
-                    bos_token_id = self.token_bos()
-
-                eos_token = self._model.token_get_text(eos_token_id)
-                bos_token = self._model.token_get_text(bos_token_id)
-
                 if self.verbose:
-                    print(f"Using gguf chat template: {template}", file=sys.stderr)
+                    print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
                     print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                     print(f"Using chat bos_token: {bos_token}", file=sys.stderr)

-                self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
-                    template=template,
-                    eos_token=eos_token,
-                    bos_token=bos_token,
-                    stop_token_ids=[eos_token_id],
-                ).to_chat_handler()
+                self.chat_format = "chat_template.default"

         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
             if self.verbose:
-                print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
+                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)

     @property
     def ctx(self) -> llama_cpp.llama_context_p:
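A quick usage sketch of what this template plumbing enables: if the GGUF metadata carries `tokenizer.chat_template`, it is registered as `chat_template.default` and used automatically. The model path is a placeholder and whether a template is found depends on the model file.

```python
# Hypothetical sketch: rely on the chat template stored in the GGUF metadata.
# "model.gguf" is a placeholder; detected formats are printed when verbose.
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", verbose=True)
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Name three llama facts."}],
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])
```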
@@ -950,19 +961,54 @@ class Llama:
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
+        prefix_token_id: int = self._model.token_prefix()
+        middle_token_id: int = self._model.token_middle()
+        suffix_token_id: int = self._model.token_suffix()
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens: List[int] = (
             (
-                self.tokenize(prompt.encode("utf-8"), special=True)
-                if prompt != ""
-                else [self.token_bos()]
+                [prefix_token_id]
+                if prefix_token_id >= 0 and suffix is not None
+                else []
             )
-            if isinstance(prompt, str)
-            else prompt
+            +
+            (
+                (
+                    self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
+                    if prompt != ""
+                    else (
+                        []
+                        if prefix_token_id >= 0 and suffix is not None
+                        else [self.token_bos()]
+                    )
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            +
+            (
+                (
+                    [suffix_token_id]
+                    +
+                    (
+                        self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
+                        if suffix
+                        else []
+                    )
+                )
+                if suffix_token_id >= 0 and suffix is not None
+                else []
+            )
+            +
+            (
+                [middle_token_id]
+                if middle_token_id >= 0 and suffix is not None
+                else []
+            )
         )
         text: bytes = b""
         returned_tokens: int = 0
         stop = (
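For readers unfamiliar with fill-in-the-middle, a hedged sketch of how the new `suffix` handling is meant to be used. The model path is a placeholder; the model's vocabulary must actually define prefix/suffix/middle tokens (for example a code-infilling GGUF), otherwise the suffix is simply appended as before.

```python
# Hypothetical fill-in-the-middle call; requires a GGUF whose vocabulary
# defines FIM prefix/suffix/middle tokens. Path and values are placeholders.
from llama_cpp import Llama

llm = Llama(model_path="codellama-7b.Q4_K_M.gguf")
out = llm.create_completion(
    prompt="def fibonacci(n):\n    ",
    suffix="\n    return result\n",  # text that should follow the infill
    max_tokens=64,
    temperature=0.1,
)
# With the prompt construction above, the input becomes roughly
# <PRE> prompt <SUF> suffix <MID> and the model fills in the middle.
print(out["choices"][0]["text"])
```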
@@ -1341,7 +1387,7 @@ class Llama:
         if echo:
             text_str = prompt + text_str

-        if suffix is not None:
+        if suffix_token_id < 0 and suffix is not None:
             text_str = text_str + suffix

         logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1679,7 +1725,7 @@ class Llama:
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+        handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
             self.chat_format
         )
         return handler(
@@ -2038,3 +2084,19 @@ class StoppingCriteriaList(List[StoppingCriteria]):
         self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
     ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores
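A minimal client-side usage sketch for the new processor, mirroring what the server wires up for the `min_tokens` request field further below (the model path is a placeholder):

```python
# Suppress EOS until at least 16 completion tokens have been generated.
# "model.gguf" is a placeholder path.
from llama_cpp import Llama, LogitsProcessorList, MinTokensLogitsProcessor

llm = Llama(model_path="model.gguf")
out = llm.create_completion(
    "Q: Why is the sky blue? A:",
    max_tokens=128,
    logits_processor=LogitsProcessorList(
        [MinTokensLogitsProcessor(16, llm.token_eos())]
    ),
)
print(out["choices"][0]["text"])
```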
llama_cpp/llama_chat_format.py

@@ -11,6 +11,7 @@ from contextlib import ExitStack
 from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast

 import jinja2
+from jinja2.sandbox import ImmutableSandboxedEnvironment

 import numpy as np
 import numpy.typing as npt

@@ -191,7 +192,7 @@ class Jinja2ChatFormatter(ChatFormatter):
         self.add_generation_prompt = add_generation_prompt
         self.stop_token_ids = set(stop_token_ids) if stop_token_ids is not None else None

-        self._environment = jinja2.Environment(
+        self._environment = ImmutableSandboxedEnvironment(
             loader=jinja2.BaseLoader(),
             trim_blocks=True,
             lstrip_blocks=True,

@@ -684,8 +685,7 @@ def hf_tokenizer_config_to_chat_formatter(
     assert isinstance(tokenizer_config["eos_token"], str)
     eos_token = tokenizer_config["eos_token"]

-    env = jinja2.Environment(
-        loader=jinja2.BaseLoader(),
+    env = ImmutableSandboxedEnvironment(
         trim_blocks=True,
         lstrip_blocks=True,
     ).from_string(chat_template)
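To illustrate why the sandboxed environment matters for the template-injection fix noted in the changelog, a small self-contained sketch follows. The template string is a deliberately malicious example, not something from the repository.

```python
# Sketch: the same template rendered with a plain Environment versus the
# ImmutableSandboxedEnvironment now used for chat templates.
import jinja2
from jinja2.exceptions import SecurityError
from jinja2.sandbox import ImmutableSandboxedEnvironment

# A hypothetical malicious chat template probing interpreter internals.
malicious = "{{ ''.__class__.__mro__[1].__subclasses__() }}"

# The plain environment happily walks object internals.
jinja2.Environment(loader=jinja2.BaseLoader()).from_string(malicious).render()

# The sandbox refuses access to underscore-prefixed attributes.
try:
    ImmutableSandboxedEnvironment(loader=jinja2.BaseLoader()).from_string(malicious).render()
except SecurityError as exc:
    print("blocked by sandbox:", exc)
```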
@@ -1894,6 +1894,8 @@ def functionary_v1_v2_chat_handler(
         function_call = (
             tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
         )
+    elif function_call is not None:
+        pass
     else:
         function_call = "auto"

@@ -1930,11 +1932,10 @@ def functionary_v1_v2_chat_handler(
             logits_processor=logits_processor,
             grammar=grammar,
         )
+        if stream is False:
+            completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
         return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore

-    assert stream is False  # TODO: support stream mode
-
     def get_grammar(function_call):
         function_body = None
         for function in functions or []:

@@ -1968,7 +1969,7 @@ def functionary_v1_v2_chat_handler(

         return grammar

-    def create_completion(stop):
+    def create_completion(prompt, stop, grammar):
         completion = cast(llama_types.Completion, llama.create_completion(
             prompt=prompt,
             temperature=temperature,

@@ -1976,7 +1977,7 @@ def functionary_v1_v2_chat_handler(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            stream=False,
+            stream=stream,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -1997,6 +1998,315 @@ def functionary_v1_v2_chat_handler(
     function_calls, function_bodies = [], []
     completion_tokens = 0

+    def generate_streaming(tools, functions, function_call, prompt):
+        assert version == "v2", "Streaming for v1 is not supported"
+
+        chunk_id, chunk_created = None, None
+
+        # If tool_choice/function_call is provided
+        if isinstance(function_call, dict):
+            prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
+            grammar = get_grammar(function_call["name"])
+            stops = [STOP_TOKEN, FROM_TOKEN]
+            tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+            completion_text = ""
+            first = True
+            for chunk in completion:
+                # Yield the tool/function name first
+                if first:
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": 0,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_call["name"], "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
+                        ],
+                    )
+                    first = False
+                if tools is not None:
+                    func_call_dict = {
+                        "tool_calls": [
+                            {
+                                "index": 0,
+                                "id": "call_" + tool_id,
+                                "type": "function",
+                                "function": {
+                                    "name": None,
+                                    "arguments": chunk["choices"][0]["text"].rstrip(),
+                                },
+                            }
+                        ]
+                    }
+                else:
+                    func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": None,
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
+            # Yield tool_call/function_call stop message
+            yield llama_types.CreateChatCompletionStreamResponse(
+                id="chat" + chunk["id"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                model=chunk["model"],
+                choices=[
+                    {
+                        "index": 0,
+                        "finish_reason": "tool_calls" if tools is not None else "function_call",
+                        "logprobs": None,
+                        "delta": {
+                            "role": None, "content": None, "function_call": None, "tool_calls": None
+                        },
+                    }
+                ],
+            )
+        # If "auto" or no tool_choice/function_call
+        elif isinstance(function_call, str) and function_call == "auto":
+            tool_index = 0
+            while True:
+                # Generate function name first
+                grammar = None
+                stops = CONTENT_TOKEN
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                completion_text = ""
+                for chunk in completion:
+                    completion_text += chunk["choices"][0]["text"]
+                    if chunk_id is None:
+                        chunk_id = chunk["id"]
+                    if chunk_created is None:
+                        chunk_created = chunk["created"]
+                function_name = completion_text.strip()
+                if function_name == "all":
+                    prompt += "all\n<|content|>"
+                    # Yield the first empty message for content
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        model=chunk["model"],
+                        created=chunk_created,
+                        object="chat.completion.chunk",
+                        choices=[
+                            {
+                                "index": 0,
+                                "delta": {"role": "assistant", "content": ""},
+                                "logprobs": None,
+                                "finish_reason": None,
+                            }
+                        ],
+                    )
+                else:
+                    prompt += f"{function_name}\n<|content|>"
+                    grammar = get_grammar(function_name)
+                    tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": tool_index,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_name, "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
+                    # Stream function name
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        object="chat.completion.chunk",
+                        created=chunk_created,
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": "assistant",
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
+                # Generate content
+                stops = [RECIPIENT_TOKEN, STOP_TOKEN]
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                if function_name == "all":
+                    completion_text = ""
+                    stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
+                    for i, chunk in enumerate(completion):
+                        completion_text += chunk["choices"][0]["text"]
+                        if is_end:
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            if stop_sequence.startswith("".join(buffer)):
+                                continue
+                            else:
+                                buffer.pop()
+                                while len(buffer) > 0:
+                                    yield llama_types.CreateChatCompletionStreamResponse(
+                                        id="chat" + chunk_id,
+                                        object="chat.completion.chunk",
+                                        created=chunk_created,
+                                        model=chunk["model"],
+                                        choices=[
+                                            {
+                                                "index": 0,
+                                                "logprobs": chunk["choices"][0]["logprobs"],
+                                                "delta": {
+                                                    "role": "assistant", "content": buffer.pop(0)
+                                                },
+                                            }
+                                        ],
+                                    )
+                                is_end = False
+                        elif chunk["choices"][0]["text"] == "\n":
+                            is_end = True
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            continue
+
+                        if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": "assistant",
+                                            "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
+                                        },
+                                    }
+                                ],
+                            )
+                    # Check whether the model wants to generate another turn
+                    if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
+                        if completion_text.endswith("\n<|from|>assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                        elif completion_text.endswith("\n<|from|> assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
+                        else:
+                            cleaned_completion_text = completion_text.strip()
+                        prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
+                    else:
+                        # Yield stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            model=chunk["model"],
+                            created=chunk_created,
+                            object="chat.completion.chunk",
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "delta": {},
+                                    "logprobs": None,
+                                    "finish_reason": "stop",
+                                }
+                            ],
+                        )
+                        break
+                else:
+                    # Check whether the model wants to generate another turn
+                    completion_text = ""
+                    for chunk in completion:
+                        completion_text += chunk["choices"][0]["text"]
+                        if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                            if tools is not None:
+                                func_call_dict = {
+                                    "tool_calls": [
+                                        {
+                                            "index": tool_index,
+                                            "id": "call_" + tool_id,
+                                            "type": "function",
+                                            "function": {
+                                                "name": None,
+                                                "arguments": chunk["choices"][0]["text"].rstrip(),
+                                            },
+                                        }
+                                    ]
+                                }
+                            else:
+                                func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": None,
+                                            "content": None,
+                                            **func_call_dict,
+                                        },
+                                    }
+                                ],
+                            )
+                    prompt += completion_text.strip()
+                    grammar = None
+                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                    completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
+                    if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
+                        prompt += "\n<|from|>assistant\n<|recipient|>"
+                        tool_index += 1
+                    else:
+                        # Yield tool_call/function_call stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            object="chat.completion.chunk",
+                            created=chunk_created,
+                            model=chunk["model"],
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "finish_reason": "tool_calls" if tools is not None else "function_call",
+                                    "logprobs": None,
+                                    "delta": {
+                                        "role": None, "content": None, "function_call": None, "tool_calls": None
+                                    },
+                                }
+                            ],
+                        )
+                        break
+
+    if stream is not False:
+        return generate_streaming(
+            tools=tools, functions=functions, function_call=function_call, prompt=prompt
+        )
+    else:
         if version == "v1":
             # If no or "auto" tool_choice/function_call
             if isinstance(function_call, str) and function_call == "auto":
@@ -2012,7 +2322,7 @@ def functionary_v1_v2_chat_handler(
                 prompt = prompt
                 stops = ["\n", END_ASSISTANT_TOKEN]

-            completion = create_completion(stop=stops)
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = completion["choices"][0]["text"]
             completion_tokens += completion["usage"]["completion_tokens"]

@@ -2039,7 +2349,7 @@ def functionary_v1_v2_chat_handler(
                     completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
                 )
                 grammar = get_grammar(function_calls[-1])
-                completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+                completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_bodies.append(completion["choices"][0]["text"].strip())
             # If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2053,7 +2363,7 @@ def functionary_v1_v2_chat_handler(
                 function_calls.append(function_call)
                 grammar = get_grammar(function_call)
                 stops = [STOP_TOKEN, FROM_TOKEN]
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_bodies.append(completion_text.strip())
@@ -2063,7 +2373,7 @@ def functionary_v1_v2_chat_handler(
                     # Generate function name first
                     grammar = None
                     stops = CONTENT_TOKEN
-                    completion = create_completion(stop=stops)
+                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                     completion_text = completion["choices"][0]["text"]
                     completion_tokens += completion["usage"]["completion_tokens"]
                     function_name = completion_text.strip()
@@ -2076,7 +2386,7 @@ def functionary_v1_v2_chat_handler(
                         grammar = get_grammar(function_call)
                     # Generate content
                     stops = [RECIPIENT_TOKEN, STOP_TOKEN]
-                    completion = create_completion(stop=stops)
+                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                     completion_text = completion["choices"][0]["text"]
                     completion_tokens += completion["usage"]["completion_tokens"]
                     if function_name == "all":
@@ -2103,7 +2413,7 @@ def functionary_v1_v2_chat_handler(
                         # Check whether the model wants to generate another turn
                         prompt += completion_text.strip()
                         grammar = None
-                        completion = create_completion(stop=stops)
+                        completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                         completion_tokens += completion["usage"]["completion_tokens"]
                         if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                             prompt += "\n<|from|>assistant\n<|recipient|>"
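Taken together, these changes let the functionary v2 handler stream tool calls. A hedged usage sketch follows; the model and tokenizer identifiers are placeholders and functionary models also need their Hugging Face tokenizer as shown.

```python
# Hypothetical streaming tool-call request against a functionary-v2 model.
# Repo/filename are placeholders for a functionary v2 GGUF.
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
)

stream = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the weather in Tokyo?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        },
    }],
    tool_choice="auto",
    stream=True,
)
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if delta.get("tool_calls"):
        # Argument fragments for the streamed tool call arrive incrementally.
        print(delta["tool_calls"][0]["function"]["arguments"], end="", flush=True)
```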
@@ -2165,7 +2475,7 @@ def functionary_v1_v2_chat_handler(


 class Llava15ChatHandler:
-    DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+    DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -2288,18 +2598,31 @@ class Llava15ChatHandler:
         assert self.clip_ctx is not None

         system_prompt = _get_system_message(messages)
-        if system_prompt == "":
+        if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
             messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages

         image_urls = self.get_image_urls(messages)
-        template = jinja2.Template(self.CHAT_FORMAT)
-        text = template.render(messages=messages, add_generation_prompt=True)
+        template = ImmutableSandboxedEnvironment(
+            trim_blocks=True,
+            lstrip_blocks=True,
+        ).from_string(self.CHAT_FORMAT)
+        text = template.render(
+            messages=messages,
+            add_generation_prompt=True,
+            eos_token=llama.detokenize([llama.token_eos()]),
+            bos_token=llama.detokenize([llama.token_bos()]),
+        )
         split_text = self.split_text_on_image_urls(text, image_urls)

         def embed_image_bytes(image_bytes: bytes):
             if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
                 return self._last_image_embed
             with suppress_stdout_stderr(disable=self.verbose):
+                # Free the previous image embed
+                if self._last_image_embed is not None:
+                    self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                    self._last_image_embed = None
+                    self._last_image_hash = None
                 embed = (
                     self._llava_cpp.llava_image_embed_make_with_bytes(
                         self.clip_ctx,
@@ -2314,9 +2637,10 @@ class Llava15ChatHandler:

         # Evaluate prompt
         llama.reset()
-        for i, (type_, value) in enumerate(split_text):
+        llama._ctx.kv_cache_clear()
+        for type_, value in split_text:
             if type_ == "text":
-                tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
+                tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                 if llama.n_tokens + len(tokens) > llama.n_ctx():
                     raise ValueError("Prompt exceeds n_ctx")  # TODO: Fix
                 llama.eval(tokens)
@@ -2334,6 +2658,8 @@ class Llava15ChatHandler:
                     llama.n_batch,
                     n_past_p,
                 )
+                # Required to avoid issues with hf tokenizer
+                llama.input_ids[llama.n_tokens : n_past.value] = -1
                 llama.n_tokens = n_past.value

         # Get prompt tokens to avoid a cache miss
@@ -2723,6 +3049,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
     # Answer the question<|im_end|><|im_start|>user
     # <image>
     # What is the picture about?<|im_end|><|im_start|>assistant
+    DEFAULT_SYSTEM_MESSAGE = "Answer the question"

     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -2771,6 +3098,66 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )

+
+class Llama3VisionAlpha(Llava15ChatHandler):
+    # question = "<image>" + q
+
+    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+
+        "<|start_header_id|>"
+
+        "{% if message.role == 'user' %}"
+
+        "user<|end_header_id|>\n\n"
+
+        "{% if message.content is iterable %}"
+
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        # Question:
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        "{% endif %}"
+
+        # Question:
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "{% endif %}"
+
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "assistant<|end_header_id|>\n\n"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "<|eot_id|>"
+
+        "{% endfor %}"
+
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "{% endif %}"
+    )
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(

@@ -2858,8 +3245,7 @@ def chatml_function_calling(
         "{% endfor %}"
         "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
     )
-    template_renderer = jinja2.Environment(
-        loader=jinja2.BaseLoader(),
+    template_renderer = ImmutableSandboxedEnvironment(
         autoescape=jinja2.select_autoescape(["html", "xml"]),
         undefined=jinja2.StrictUndefined,
     ).from_string(function_calling_template)
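A hedged usage sketch for the new handler; the model and projector file names are placeholders for a llama-3-vision-alpha GGUF plus its CLIP projector, and it follows the same pattern as the llava example in the README above.

```python
# Hypothetical local usage of the new llama-3-vision-alpha chat format.
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llama3VisionAlpha

chat_handler = Llama3VisionAlpha(clip_model_path="llama-3-vision-alpha-mmproj.gguf")
llm = Llama(
    model_path="llama-3-vision-alpha.Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embedding
)
response = llm.create_chat_completion(
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            {"type": "text", "text": "What is in this image?"},
        ],
    }]
)
print(response["choices"][0]["message"]["content"])
```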
llama_cpp/llama_cpp.py

@@ -294,6 +294,11 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_MPT = 5,
 # LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
 # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1

@@ -303,6 +308,11 @@ LLAMA_VOCAB_PRE_TYPE_FALCON = 4
 LLAMA_VOCAB_PRE_TYPE_MPT = 5
 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+LLAMA_VOCAB_PRE_TYPE_REFACT = 8
+LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
+LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
+LLAMA_VOCAB_PRE_TYPE_OLMO = 11
+LLAMA_VOCAB_PRE_TYPE_DBRX = 12


 # // note: these values should be synchronized with ggml_rope

@@ -371,6 +381,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors

 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };

@@ -403,6 +414,8 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
 LLAMA_FTYPE_MOSTLY_IQ2_S = 28
 LLAMA_FTYPE_MOSTLY_IQ2_M = 29
 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
+LLAMA_FTYPE_MOSTLY_IQ1_M = 31
+LLAMA_FTYPE_MOSTLY_BF16 = 32
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {

@@ -494,7 +507,7 @@ class llama_token_data_array(ctypes.Structure):

 llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)

-# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+# typedef bool (*llama_progress_callback)(float progress, void * user_data);
 llama_progress_callback = ctypes.CFUNCTYPE(
     ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
 )

@@ -635,6 +648,9 @@ class llama_model_kv_override(ctypes.Structure):
 # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 # const float * tensor_split;

+# // comma separated list of RPC servers to use for offloading
+# const char * rpc_servers;
+
 # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 # // If the provided progress_callback returns true, model loading continues.
 # // If it returns false, model loading is immediately aborted.

@@ -661,6 +677,7 @@ class llama_model_params(ctypes.Structure):
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
         tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
+        rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data

@@ -674,6 +691,7 @@ class llama_model_params(ctypes.Structure):
     split_mode: int
     main_gpu: int
     tensor_split: CtypesArray[ctypes.c_float]
+    rpc_servers: ctypes.c_char_p
     progress_callback: Callable[[float, ctypes.c_void_p], bool]
     progress_callback_user_data: ctypes.c_void_p
     kv_overrides: CtypesArray[llama_model_kv_override]

@@ -687,6 +705,7 @@ class llama_model_params(ctypes.Structure):
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),
         ("tensor_split", ctypes.POINTER(ctypes.c_float)),
+        ("rpc_servers", ctypes.c_char_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", ctypes.c_void_p),
         ("kv_overrides", ctypes.POINTER(llama_model_kv_override)),
llama_cpp/server/app.py

@@ -132,6 +132,7 @@ def create_app(
         middleware=middleware,
         title="🦙 llama.cpp Python API",
         version=llama_cpp.__version__,
+        root_path=server_settings.root_path,
     )
     app.add_middleware(
         CORSMiddleware,

@@ -274,6 +275,7 @@ async def create_completion(
         "best_of",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)

@@ -287,6 +289,15 @@ async def create_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.CreateCompletionResponse,
         Iterator[llama_cpp.CreateCompletionStreamResponse],

@@ -444,6 +455,7 @@ async def create_chat_completion(
         "n",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)

@@ -457,6 +469,15 @@ async def create_chat_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
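A sketch of exercising the new `min_tokens` request field against a locally running server. The URL and payload values are examples, and `requests` is an extra dependency used only for this illustration.

```python
# Hypothetical request to a local llama_cpp.server instance showing the new
# min_tokens field alongside max_tokens.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "Summarize the theory of relativity:",
        "min_tokens": 32,   # EOS is suppressed until 32 tokens are generated
        "max_tokens": 128,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["text"])
```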
llama_cpp/server/model.py

@@ -140,6 +140,20 @@ class LlamaProxy:
             chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
                 clip_model_path=settings.clip_model_path, verbose=settings.verbose
             )
+        elif settings.chat_format == "llama-3-vision-alpha":
+            assert settings.clip_model_path is not None, "clip model not found"
+            if settings.hf_model_repo_id is not None:
+                chat_handler = (
+                    llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
+                        repo_id=settings.hf_model_repo_id,
+                        filename=settings.clip_model_path,
+                        verbose=settings.verbose,
+                    )
+                )
+            else:
+                chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
+                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
+                )
         elif settings.chat_format == "hf-autotokenizer":
             assert (
                 settings.hf_pretrained_model_name_or_path is not None

@@ -228,6 +242,7 @@ class LlamaProxy:
             logits_all=settings.logits_all,
             embedding=settings.embedding,
             offload_kqv=settings.offload_kqv,
+            flash_attn=settings.flash_attn,
             # Sampling Params
             last_n_tokens_size=settings.last_n_tokens_size,
             # LoRA Params
llama_cpp/server/settings.py

@@ -215,6 +215,10 @@ class ServerSettings(BaseSettings):
         default=False,
         description="Disable EventSource pings (may be needed for some clients).",
     )
+    root_path: str = Field(
+        default="",
+        description="The root path for the server. Useful when running behind a reverse proxy.",
+    )


 class Settings(ServerSettings, ModelSettings):
llama_cpp/server/types.py

@@ -16,10 +16,14 @@ max_tokens_field = Field(
     default=16, ge=1, description="The maximum number of tokens to generate."
 )

+min_tokens_field = Field(
+    default=0,
+    ge=0,
+    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
+)
+
 temperature_field = Field(
     default=0.8,
     ge=0.0,
     le=2.0,
     description="Adjust the randomness of the generated text.\n\n"
     + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
 )

@@ -113,6 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field

@@ -208,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"
(wheel index generation script)

@@ -44,7 +44,9 @@ releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)
 # For each release, get all assets
 for release in $releases; do
     assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
-    echo "    <h2>$release</h2>" >> index.html
+    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
+    release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
+    echo "    <h2>$release_version</h2>" >> index.html
     for asset in $(echo $assets | jq -r .[].browser_download_url); do
         if [[ $asset == *".whl" ]]; then
             echo "    <a href=\"$asset\">$asset</a>" >> index.html
vendor/llama.cpp (vendored, 2 changes)

@@ -1 +1 @@
-Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961
+Subproject commit 05834841dcb4f922983ea976539c70472272df9a