Compare commits
48 commits: 1d177aaaef...5b4ad6f4d1

Commits (SHA1):
5b4ad6f4d1, 3dbfec74e7, d8a3b013c3, 03f171e810, b564d05806, d99a6ba607, e811a81066, ca8e3c967d,
5212fb08ae, 389e09c2f5, 4b54f79330, 50f5c74ecf, 43ba1526c8, 3f8e17af63, 3c19faa0d4, 3fe8e9a8f3,
9dc5e20fb6, 1547202b77, 7f59856fa6, 73165021bb, eafb6ec5e8, ac55d0a175, 4badac3a60, 561e880654,
b454f40a9a, 5ab40e6167, bf66a283e8, 3757328b70, 77122638b4, 2a39b99575, 9ce5cb376a, 4a7122d22f,
228949c1f7, 903b28adf5, 07966b9ba7, a50d24e3a7, 0318702cdc, 3666833107, 3e2597eac8, e0d7674e62,
1f56c648c3, f9b7221c8f, 9f7a85571a, 0a454bebe6, 2138561fab, 2117122396, d75dea18db, 31b1d95a6c
20 changed files with 807 additions and 193 deletions
.github/dependabot.yml (vendored, 8 changes)

@@ -8,8 +8,12 @@ updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "daily"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "weekly"
+      interval: "daily"
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"
.github/workflows/build-and-release.yaml (vendored, 4 changes)

@@ -29,7 +29,7 @@ jobs:
          python -m pip install -e .[all]

      - name: Build wheels
-       uses: pypa/cibuildwheel@v2.17.0
+       uses: pypa/cibuildwheel@v2.18.0
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
          platforms: linux/arm64

      - name: Build wheels
-       uses: pypa/cibuildwheel@v2.17.0
+       uses: pypa/cibuildwheel@v2.18.0
        env:
          CIBW_SKIP: "*musllinux* pp*"
          CIBW_REPAIR_WHEEL_COMMAND: ""
CHANGELOG.md (54 changes)

@@ -7,9 +7,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.75]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
+- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
+- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
+- misc: Remove unnecessary metadata lookups by @CISC in #1448
+
+## [0.2.74]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
+- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
+- docs: Fix typo in README.md by @yupbank in #1444
+
+## [0.2.73]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
+- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
+
+## [0.2.72]
+
+- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
+- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
+
+## [0.2.71]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
+- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
+- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
+
+## [0.2.70]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
+- feat: fill-in-middle support by @CISC in #1386
+- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
+- docs: update README.md @eltociear in #1432
+- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
+- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
+- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
+- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
+- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
+- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
+- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
+- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
+
+## [0.2.69]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
+- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
+- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
+- fix: UTF-8 handling with grammars by @jsoma in #1415
+
 ## [0.2.68]

-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
 - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
 - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
@@ -51,8 +51,9 @@ if (LLAMA_BUILD)
        )

    if (LLAVA_BUILD)
-       if (LLAMA_CUBLAS)
+       if (LLAMA_CUBLAS OR LLAMA_CUDA)
            add_compile_definitions(GGML_USE_CUBLAS)
+           add_compile_definitions(GGML_USE_CUDA)
        endif()

        if (LLAMA_METAL)
Makefile (2 changes)

@@ -16,7 +16,7 @@ build.debug:
 	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .

 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
+	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

 build.opencl:
 	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
 llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 llm.create_chat_completion(
     messages = [
@@ -547,10 +547,10 @@ llm = Llama.from_pretrained(
   repo_id="vikhyatk/moondream2",
   filename="*text-model*",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )

-respoonse = llm.create_chat_completion(
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",
examples/ray/README.md (new file, 19 additions)

@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```
examples/ray/llm.py (new executable file, 20 additions)

@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])
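A minimal client-side sketch for the endpoint the example above starts; the host, port, request body, and response shape follow the README and llm.py, while the exact serialization behaviour of Ray Serve for dict returns is assumed rather than taken from this diff:

```python
import json
import urllib.request

# Assumes `serve run llm:llm_builder model_path=...` from the README is already running.
payload = json.dumps({"prompt": "tell me a joke", "max_tokens": 128}).encode("utf-8")
request = urllib.request.Request(
    "http://localhost:8000/",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    completion = json.load(response)

# LlamaDeployment returns the usual create_completion dict
print(completion["choices"][0]["text"])
```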
examples/ray/requirements.txt (new file, 3 additions)

@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.68"
+__version__ = "0.2.75"
@@ -203,7 +203,7 @@ class _LlamaModel:
        # NOTE: Llama1 models automatically added a space at the start of the prompt
        # this line removes a leading space if the first token is a beginning of sentence token
        return (
-           output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+           output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
        )

    # Extra
@@ -262,7 +262,12 @@ class Llama:
                    raise ValueError(f"Value for {k} is too long: {v}")
                v_bytes = v_bytes.ljust(128, b"\0")
                self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
-               self._kv_overrides_array[i].value.str_value[:128] = v_bytes
+               # copy min(v_bytes, 128) to str_value
+               ctypes.memmove(
+                   self._kv_overrides_array[i].value.str_value,
+                   v_bytes,
+                   min(len(v_bytes), 128),
+               )
            else:
                raise ValueError(f"Unknown value type for {k}: {v}")
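For reference, a standalone sketch of what the memmove call above does with a fixed 128-byte ctypes field; the buffer name and value below are made up for illustration:

```python
import ctypes

# Stand-in for the 128-byte str_value field of llama_model_kv_override
str_value = (ctypes.c_char * 128)()
v_bytes = b"some-kv-override-value"

# Copy only the bytes we actually have; the rest of the field stays zeroed
ctypes.memmove(str_value, v_bytes, min(len(v_bytes), 128))
print(str_value.value)  # b'some-kv-override-value'
```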
@@ -373,6 +378,7 @@ class Llama:

        self.chat_format = chat_format
        self.chat_handler = chat_handler
+       self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}

        self.draft_model = draft_model
@@ -404,10 +410,33 @@ class Llama:
        if self.verbose:
            print(f"Model metadata: {self.metadata}", file=sys.stderr)

+       eos_token_id = self.token_eos()
+       bos_token_id = self.token_bos()
+
+       eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
+       bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
+
+       # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
+       template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
+
+       if "tokenizer.chat_template" in self.metadata:
+           template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
+
+       if self.verbose and template_choices:
+           print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
+
+       for name, template in template_choices.items():
+           self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
+               template=template,
+               eos_token=eos_token,
+               bos_token=bos_token,
+               stop_token_ids=[eos_token_id],
+           ).to_chat_handler()
+
        if (
            self.chat_format is None
            and self.chat_handler is None
-           and "tokenizer.chat_template" in self.metadata
+           and "chat_template.default" in template_choices
        ):
            chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
                self.metadata
@@ -418,35 +447,17 @@ class Llama:
            if self.verbose:
                print(f"Guessed chat format: {chat_format}", file=sys.stderr)
        else:
-           template = self.metadata["tokenizer.chat_template"]
-           try:
-               eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
-           except:
-               eos_token_id = self.token_eos()
-           try:
-               bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
-           except:
-               bos_token_id = self.token_bos()
-
-           eos_token = self._model.token_get_text(eos_token_id)
-           bos_token = self._model.token_get_text(bos_token_id)
-
            if self.verbose:
-               print(f"Using gguf chat template: {template}", file=sys.stderr)
+               print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
                print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                print(f"Using chat bos_token: {bos_token}", file=sys.stderr)

-           self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
-               template=template,
-               eos_token=eos_token,
-               bos_token=bos_token,
-               stop_token_ids=[eos_token_id],
-           ).to_chat_handler()
+           self.chat_format = "chat_template.default"

        if self.chat_format is None and self.chat_handler is None:
            self.chat_format = "llama-2"
            if self.verbose:
-               print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
+               print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)

    @property
    def ctx(self) -> llama_cpp.llama_context_p:
@@ -950,19 +961,54 @@ class Llama:

        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
        created: int = int(time.time())
+       prefix_token_id: int = self._model.token_prefix()
+       middle_token_id: int = self._model.token_middle()
+       suffix_token_id: int = self._model.token_suffix()
        # If prompt is empty, initialize completion with BOS token to avoid
        # detokenization including a space at the beginning of the completion
        completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
        # Add blank space to start of prompt to match OG llama tokenizer
        prompt_tokens: List[int] = (
            (
-               self.tokenize(prompt.encode("utf-8"), special=True)
-               if prompt != ""
-               else [self.token_bos()]
+               [prefix_token_id]
+               if prefix_token_id >= 0 and suffix is not None
+               else []
            )
-           if isinstance(prompt, str)
-           else prompt
+           +
+           (
+               (
+                   self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
+                   if prompt != ""
+                   else (
+                       []
+                       if prefix_token_id >= 0 and suffix is not None
+                       else [self.token_bos()]
+                   )
+               )
+               if isinstance(prompt, str)
+               else prompt
+           )
+           +
+           (
+               (
+                   [suffix_token_id]
+                   +
+                   (
+                       self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
+                       if suffix
+                       else []
+                   )
+               )
+               if suffix_token_id >= 0 and suffix is not None
+               else []
+           )
+           +
+           (
+               [middle_token_id]
+               if middle_token_id >= 0 and suffix is not None
+               else []
+           )
        )
        text: bytes = b""
        returned_tokens: int = 0
        stop = (
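The prefix/suffix/middle tokens wired in above enable fill-in-middle style completions through the existing suffix parameter; a hedged usage sketch, where the model path is a placeholder and the model is assumed to ship FIM special tokens:

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/your-fim-capable-model.gguf")  # placeholder path

# prompt is the text before the gap, suffix the text after it; the infill goes in between
result = llm.create_completion(
    prompt="def fibonacci(n):\n    ",
    suffix="\n\nprint(fibonacci(10))\n",
    max_tokens=48,
)
print(result["choices"][0]["text"])
```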
@@ -1341,7 +1387,7 @@ class Llama:
        if echo:
            text_str = prompt + text_str

-       if suffix is not None:
+       if suffix_token_id < 0 and suffix is not None:
            text_str = text_str + suffix

        logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1679,7 +1725,7 @@ class Llama:
        Returns:
            Generated chat completion or a stream of chat completion chunks.
        """
-       handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+       handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
            self.chat_format
        )
        return handler(
@@ -2038,3 +2084,19 @@ class StoppingCriteriaList(List[StoppingCriteria]):
        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
    ) -> bool:
        return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores
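The new MinTokensLogitsProcessor can also be used directly from Python rather than through the server's min_tokens field; a small sketch with a placeholder model path:

```python
from llama_cpp import Llama, LogitsProcessorList, MinTokensLogitsProcessor

llm = Llama(model_path="./models/example.gguf")  # placeholder path

# Suppress EOS until at least 16 new tokens have been generated
processors = LogitsProcessorList(
    [MinTokensLogitsProcessor(min_tokens=16, token_eos=llm.token_eos())]
)
output = llm.create_completion(
    "Q: Why is the sky blue? A:",
    max_tokens=64,
    logits_processor=processors,
)
print(output["choices"][0]["text"])
```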
@@ -11,6 +11,7 @@ from contextlib import ExitStack
 from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast

 import jinja2
+from jinja2.sandbox import ImmutableSandboxedEnvironment

 import numpy as np
 import numpy.typing as npt
@@ -191,7 +192,7 @@ class Jinja2ChatFormatter:
        self.add_generation_prompt = add_generation_prompt
        self.stop_token_ids = set(stop_token_ids) if stop_token_ids is not None else None

-       self._environment = jinja2.Environment(
+       self._environment = ImmutableSandboxedEnvironment(
            loader=jinja2.BaseLoader(),
            trim_blocks=True,
            lstrip_blocks=True,
@@ -684,8 +685,7 @@ def hf_tokenizer_config_to_chat_formatter(
    assert isinstance(tokenizer_config["eos_token"], str)
    eos_token = tokenizer_config["eos_token"]

-   env = jinja2.Environment(
-       loader=jinja2.BaseLoader(),
+   env = ImmutableSandboxedEnvironment(
        trim_blocks=True,
        lstrip_blocks=True,
    ).from_string(chat_template)
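Both formatter paths now render through Jinja2's immutable sandbox instead of a plain Environment; a small standalone illustration of the behaviour this relies on (not part of the diff itself):

```python
import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment

# A template that tries to mutate state it was given
template_source = "{{ messages.append('injected') }}"

# A plain Environment executes the mutation without complaint
jinja2.Environment().from_string(template_source).render(messages=[])

# The immutable sandbox rejects modifying calls on built-in mutable types
try:
    ImmutableSandboxedEnvironment().from_string(template_source).render(messages=[])
except jinja2.exceptions.SecurityError as error:
    print("blocked:", error)
```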
@@ -1894,6 +1894,8 @@ def functionary_v1_v2_chat_handler(
        function_call = (
            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
        )
+   elif function_call is not None:
+       pass
    else:
        function_call = "auto"
@@ -1930,11 +1932,10 @@ def functionary_v1_v2_chat_handler(
            logits_processor=logits_processor,
            grammar=grammar,
        )
-       completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
+       if stream is False:
+           completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
        return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore

-   assert stream is False # TODO: support stream mode
-
    def get_grammar(function_call):
        function_body = None
        for function in functions or []:
@@ -1968,7 +1969,7 @@ def functionary_v1_v2_chat_handler(

        return grammar

-   def create_completion(stop):
+   def create_completion(prompt, stop, grammar):
        completion = cast(llama_types.Completion, llama.create_completion(
            prompt=prompt,
            temperature=temperature,
@@ -1976,7 +1977,7 @@ def functionary_v1_v2_chat_handler(
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
-           stream=False,
+           stream=stream,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
@@ -1997,6 +1998,315 @@ def functionary_v1_v2_chat_handler(
    function_calls, function_bodies = [], []
    completion_tokens = 0

+   def generate_streaming(tools, functions, function_call, prompt):
+       assert version == "v2", "Streaming for v1 is not supported"
+
+       chunk_id, chunk_created = None, None
+
+       # If tool_choice/function_call is provided
+       if isinstance(function_call, dict):
+           prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
+           grammar = get_grammar(function_call["name"])
+           stops = [STOP_TOKEN, FROM_TOKEN]
+           tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+           completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+           completion_text = ""
+           first = True
+           for chunk in completion:
+               # Yield the tool/function name first
+               if first:
+                   if tools is not None:
+                       func_call_dict = {
+                           "tool_calls": [
+                               {
+                                   "index": 0,
+                                   "id": "call_" + tool_id,
+                                   "type": "function",
+                                   "function": {"name": function_call["name"], "arguments": ""},
+                               }
+                           ]
+                       }
+                   else:
+                       func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
+                   yield llama_types.CreateChatCompletionStreamResponse(
+                       id="chat" + chunk["id"],
+                       object="chat.completion.chunk",
+                       created=chunk["created"],
+                       model=chunk["model"],
+                       choices=[
+                           {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
+                       ],
+                   )
+                   first = False
+               if tools is not None:
+                   func_call_dict = {
+                       "tool_calls": [
+                           {
+                               "index": 0,
+                               "id": "call_" + tool_id,
+                               "type": "function",
+                               "function": {
+                                   "name": None,
+                                   "arguments": chunk["choices"][0]["text"].rstrip(),
+                               },
+                           }
+                       ]
+                   }
+               else:
+                   func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+               if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                   yield llama_types.CreateChatCompletionStreamResponse(
+                       id="chat" + chunk["id"],
+                       object="chat.completion.chunk",
+                       created=chunk["created"],
+                       model=chunk["model"],
+                       choices=[
+                           {
+                               "index": 0,
+                               "logprobs": chunk["choices"][0]["logprobs"],
+                               "delta": {
+                                   "role": None,
+                                   "content": None,
+                                   **func_call_dict,
+                               },
+                           }
+                       ],
+                   )
+           # Yield tool_call/function_call stop message
+           yield llama_types.CreateChatCompletionStreamResponse(
+               id="chat" + chunk["id"],
+               object="chat.completion.chunk",
+               created=chunk["created"],
+               model=chunk["model"],
+               choices=[
+                   {
+                       "index": 0,
+                       "finish_reason": "tool_calls" if tools is not None else "function_call",
+                       "logprobs": None,
+                       "delta": {
+                           "role": None, "content": None, "function_call": None, "tool_calls": None
+                       },
+                   }
+               ],
+           )
+       # If "auto" or no tool_choice/function_call
+       elif isinstance(function_call, str) and function_call == "auto":
+           tool_index = 0
+           while True:
+               # Generate function name first
+               grammar = None
+               stops = CONTENT_TOKEN
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+               completion_text = ""
+               for chunk in completion:
+                   completion_text += chunk["choices"][0]["text"]
+                   if chunk_id is None:
+                       chunk_id = chunk["id"]
+                   if chunk_created is None:
+                       chunk_created = chunk["created"]
+               function_name = completion_text.strip()
+               if function_name == "all":
+                   prompt += "all\n<|content|>"
+                   # Yield the first empty message for content
+                   yield llama_types.CreateChatCompletionStreamResponse(
+                       id="chat" + chunk_id,
+                       model=chunk["model"],
+                       created=chunk_created,
+                       object="chat.completion.chunk",
+                       choices=[
+                           {
+                               "index": 0,
+                               "delta": {"role": "assistant", "content": ""},
+                               "logprobs": None,
+                               "finish_reason": None,
+                           }
+                       ],
+                   )
+               else:
+                   prompt += f"{function_name}\n<|content|>"
+                   grammar = get_grammar(function_name)
+                   tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+                   if tools is not None:
+                       func_call_dict = {
+                           "tool_calls": [
+                               {
+                                   "index": tool_index,
+                                   "id": "call_" + tool_id,
+                                   "type": "function",
+                                   "function": {"name": function_name, "arguments": ""},
+                               }
+                           ]
+                       }
+                   else:
+                       func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
+                   # Stream function name
+                   yield llama_types.CreateChatCompletionStreamResponse(
+                       id="chat" + chunk_id,
+                       object="chat.completion.chunk",
+                       created=chunk_created,
+                       model=chunk["model"],
+                       choices=[
+                           {
+                               "index": 0,
+                               "logprobs": chunk["choices"][0]["logprobs"],
+                               "delta": {
+                                   "role": "assistant",
+                                   "content": None,
+                                   **func_call_dict,
+                               },
+                           }
+                       ],
+                   )
+               # Generate content
+               stops = [RECIPIENT_TOKEN, STOP_TOKEN]
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+               if function_name == "all":
+                   completion_text = ""
+                   stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
+                   for i, chunk in enumerate(completion):
+                       completion_text += chunk["choices"][0]["text"]
+                       if is_end:
+                           buffer.append(chunk["choices"][0]["text"].strip(" "))
+                           if stop_sequence.startswith("".join(buffer)):
+                               continue
+                           else:
+                               buffer.pop()
+                               while len(buffer) > 0:
+                                   yield llama_types.CreateChatCompletionStreamResponse(
+                                       id="chat" + chunk_id,
+                                       object="chat.completion.chunk",
+                                       created=chunk_created,
+                                       model=chunk["model"],
+                                       choices=[
+                                           {
+                                               "index": 0,
+                                               "logprobs": chunk["choices"][0]["logprobs"],
+                                               "delta": {
+                                                   "role": "assistant", "content": buffer.pop(0)
+                                               },
+                                           }
+                                       ],
+                                   )
+                               is_end = False
+                       elif chunk["choices"][0]["text"] == "\n":
+                           is_end = True
+                           buffer.append(chunk["choices"][0]["text"].strip(" "))
+                           continue
+
+                       if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
+                           yield llama_types.CreateChatCompletionStreamResponse(
+                               id="chat" + chunk_id,
+                               object="chat.completion.chunk",
+                               created=chunk_created,
+                               model=chunk["model"],
+                               choices=[
+                                   {
+                                       "index": 0,
+                                       "logprobs": chunk["choices"][0]["logprobs"],
+                                       "delta": {
+                                           "role": "assistant",
+                                           "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
+                                       },
+                                   }
+                               ],
+                           )
+                   # Check whether the model wants to generate another turn
+                   if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
+                       if completion_text.endswith("\n<|from|>assistant\n"):
+                           cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                       elif completion_text.endswith("\n<|from|> assistant\n"):
+                           cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
+                       else:
+                           cleaned_completion_text = completion_text.strip()
+                       prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
+                   else:
+                       # Yield stop message
+                       yield llama_types.CreateChatCompletionStreamResponse(
+                           id="chat" + chunk_id,
+                           model=chunk["model"],
+                           created=chunk_created,
+                           object="chat.completion.chunk",
+                           choices=[
+                               {
+                                   "index": 0,
+                                   "delta": {},
+                                   "logprobs": None,
+                                   "finish_reason": "stop",
+                               }
+                           ],
+                       )
+                       break
+               else:
+                   # Check whether the model wants to generate another turn
+                   completion_text = ""
+                   for chunk in completion:
+                       completion_text += chunk["choices"][0]["text"]
+                       if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                           if tools is not None:
+                               func_call_dict = {
+                                   "tool_calls": [
+                                       {
+                                           "index": tool_index,
+                                           "id": "call_" + tool_id,
+                                           "type": "function",
+                                           "function": {
+                                               "name": None,
+                                               "arguments": chunk["choices"][0]["text"].rstrip(),
+                                           },
+                                       }
+                                   ]
+                               }
+                           else:
+                               func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                           yield llama_types.CreateChatCompletionStreamResponse(
+                               id="chat" + chunk_id,
+                               object="chat.completion.chunk",
+                               created=chunk_created,
+                               model=chunk["model"],
+                               choices=[
+                                   {
+                                       "index": 0,
+                                       "logprobs": chunk["choices"][0]["logprobs"],
+                                       "delta": {
+                                           "role": None,
+                                           "content": None,
+                                           **func_call_dict,
+                                       },
+                                   }
+                               ],
+                           )
+                   prompt += completion_text.strip()
+                   grammar = None
+                   completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                   completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
+                   if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
+                       prompt += "\n<|from|>assistant\n<|recipient|>"
+                       tool_index += 1
+                   else:
+                       # Yield tool_call/function_call stop message
+                       yield llama_types.CreateChatCompletionStreamResponse(
+                           id="chat" + chunk_id,
+                           object="chat.completion.chunk",
+                           created=chunk_created,
+                           model=chunk["model"],
+                           choices=[
+                               {
+                                   "index": 0,
+                                   "finish_reason": "tool_calls" if tools is not None else "function_call",
+                                   "logprobs": None,
+                                   "delta": {
+                                       "role": None, "content": None, "function_call": None, "tool_calls": None
+                                   },
+                               }
+                           ],
+                       )
+                       break
+
+   if stream is not False:
+       return generate_streaming(
+           tools=tools, functions=functions, function_call=function_call, prompt=prompt
+       )
+   else:
        if version == "v1":
            # If no or "auto" tool_choice/function_call
            if isinstance(function_call, str) and function_call == "auto":
@@ -2012,7 +2322,7 @@ def functionary_v1_v2_chat_handler(
                prompt = prompt
                stops = ["\n", END_ASSISTANT_TOKEN]

-               completion = create_completion(stop=stops)
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_text = completion["choices"][0]["text"]
                completion_tokens += completion["usage"]["completion_tokens"]
@@ -2039,7 +2349,7 @@ def functionary_v1_v2_chat_handler(
                    completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
                )
                grammar = get_grammar(function_calls[-1])
-               completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+               completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
                completion_tokens += completion["usage"]["completion_tokens"]
                function_bodies.append(completion["choices"][0]["text"].strip())
            # If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2053,7 +2363,7 @@ def functionary_v1_v2_chat_handler(
                function_calls.append(function_call)
                grammar = get_grammar(function_call)
                stops = [STOP_TOKEN, FROM_TOKEN]
-               completion = create_completion(stop=stops)
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_text = completion["choices"][0]["text"]
                completion_tokens += completion["usage"]["completion_tokens"]
                function_bodies.append(completion_text.strip())
@@ -2063,7 +2373,7 @@ def functionary_v1_v2_chat_handler(
                # Generate function name first
                grammar = None
                stops = CONTENT_TOKEN
-               completion = create_completion(stop=stops)
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_text = completion["choices"][0]["text"]
                completion_tokens += completion["usage"]["completion_tokens"]
                function_name = completion_text.strip()
@@ -2076,7 +2386,7 @@ def functionary_v1_v2_chat_handler(
                grammar = get_grammar(function_call)
                # Generate content
                stops = [RECIPIENT_TOKEN, STOP_TOKEN]
-               completion = create_completion(stop=stops)
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_text = completion["choices"][0]["text"]
                completion_tokens += completion["usage"]["completion_tokens"]
                if function_name == "all":
@@ -2103,7 +2413,7 @@ def functionary_v1_v2_chat_handler(
                # Check whether the model wants to generate another turn
                prompt += completion_text.strip()
                grammar = None
-               completion = create_completion(stop=stops)
+               completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_tokens += completion["usage"]["completion_tokens"]
                if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                    prompt += "\n<|from|>assistant\n<|recipient|>"
@@ -2165,7 +2475,7 @@ def functionary_v1_v2_chat_handler(


 class Llava15ChatHandler:
-    DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+    DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -2288,18 +2598,31 @@ class Llava15ChatHandler:
        assert self.clip_ctx is not None

        system_prompt = _get_system_message(messages)
-       if system_prompt == "":
+       if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
            messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages

        image_urls = self.get_image_urls(messages)
-       template = jinja2.Template(self.CHAT_FORMAT)
-       text = template.render(messages=messages, add_generation_prompt=True)
+       template = ImmutableSandboxedEnvironment(
+           trim_blocks=True,
+           lstrip_blocks=True,
+       ).from_string(self.CHAT_FORMAT)
+       text = template.render(
+           messages=messages,
+           add_generation_prompt=True,
+           eos_token=llama.detokenize([llama.token_eos()]),
+           bos_token=llama.detokenize([llama.token_bos()]),
+       )
        split_text = self.split_text_on_image_urls(text, image_urls)

        def embed_image_bytes(image_bytes: bytes):
            if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
                return self._last_image_embed
            with suppress_stdout_stderr(disable=self.verbose):
+               # Free the previous image embed
+               if self._last_image_embed is not None:
+                   self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                   self._last_image_embed = None
+                   self._last_image_hash = None
                embed = (
                    self._llava_cpp.llava_image_embed_make_with_bytes(
                        self.clip_ctx,
@@ -2314,9 +2637,10 @@ class Llava15ChatHandler:

        # Evaluate prompt
        llama.reset()
-       for i, (type_, value) in enumerate(split_text):
+       llama._ctx.kv_cache_clear()
+       for type_, value in split_text:
            if type_ == "text":
-               tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
+               tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                if llama.n_tokens + len(tokens) > llama.n_ctx():
                    raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
                llama.eval(tokens)
@@ -2334,6 +2658,8 @@ class Llava15ChatHandler:
                    llama.n_batch,
                    n_past_p,
                )
+               # Required to avoid issues with hf tokenizer
+               llama.input_ids[llama.n_tokens : n_past.value] = -1
                llama.n_tokens = n_past.value

        # Get prompt tokens to avoid a cache miss
@@ -2723,6 +3049,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
    # Answer the question<|im_end|><|im_start|>user
    # <image>
    # What is the picture about?<|im_end|><|im_start|>assistant
+   DEFAULT_SYSTEM_MESSAGE = "Answer the question"

    CHAT_FORMAT = (
        "{% for message in messages %}"
@@ -2771,6 +3098,66 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
        "{% endif %}"
    )

+class Llama3VisionAlpha(Llava15ChatHandler):
+    # question = "<image>" + q
+
+    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+
+        "<|start_header_id|>"
+
+        "{% if message.role == 'user' %}"
+
+        "user<|end_header_id|>\n\n"
+
+        "{% if message.content is iterable %}"
+
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        # Question:
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        "{% endif %}"
+
+        # Question:
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "{% endif %}"
+
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "assistant<|end_header_id|>\n\n"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "<|eot_id|>"
+
+        "{% endfor %}"
+
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "{% endif %}"
+    )
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
@@ -2858,8 +3245,7 @@ def chatml_function_calling(
        "{% endfor %}"
        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
    )
-   template_renderer = jinja2.Environment(
-       loader=jinja2.BaseLoader(),
+   template_renderer = ImmutableSandboxedEnvironment(
        autoescape=jinja2.select_autoescape(["html", "xml"]),
        undefined=jinja2.StrictUndefined,
    ).from_string(function_calling_template)
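The new Llama3VisionAlpha handler added above plugs into Llama the same way the existing Llava handlers do; a hedged sketch with placeholder model paths and image URL:

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llama3VisionAlpha

chat_handler = Llama3VisionAlpha(clip_model_path="./path/to/mmproj.gguf")  # placeholder
llm = Llama(
    model_path="./path/to/llama-3-vision-alpha.gguf",  # placeholder
    chat_handler=chat_handler,
    n_ctx=4096,  # room for the image embedding
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```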
@@ -294,6 +294,11 @@ LLAMA_VOCAB_TYPE_WPM = 3
 #     LLAMA_VOCAB_PRE_TYPE_MPT = 5,
 #     LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
 #     LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+#     LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+#     LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+#     LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+#     LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+#     LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -303,6 +308,11 @@ LLAMA_VOCAB_PRE_TYPE_FALCON = 4
 LLAMA_VOCAB_PRE_TYPE_MPT = 5
 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+LLAMA_VOCAB_PRE_TYPE_REFACT = 8
+LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
+LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
+LLAMA_VOCAB_PRE_TYPE_OLMO = 11
+LLAMA_VOCAB_PRE_TYPE_DBRX = 12


 # // note: these values should be synchronized with ggml_rope
@@ -371,6 +381,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors

 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -403,6 +414,8 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
 LLAMA_FTYPE_MOSTLY_IQ2_S = 28
 LLAMA_FTYPE_MOSTLY_IQ2_M = 29
 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
+LLAMA_FTYPE_MOSTLY_IQ1_M = 31
+LLAMA_FTYPE_MOSTLY_BF16 = 32
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
@@ -494,7 +507,7 @@ class llama_token_data_array(ctypes.Structure):

 llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)

-# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+# typedef bool (*llama_progress_callback)(float progress, void * user_data);
 llama_progress_callback = ctypes.CFUNCTYPE(
     ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
 )
@@ -635,6 +648,9 @@ class llama_model_kv_override(ctypes.Structure):
 #     // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 #     const float * tensor_split;

+#     // comma separated list of RPC servers to use for offloading
+#     const char * rpc_servers;
+
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 #     // If the provided progress_callback returns true, model loading continues.
 #     // If it returns false, model loading is immediately aborted.
@@ -661,6 +677,7 @@ class llama_model_params(ctypes.Structure):
        split_mode (int): how to split the model across multiple GPUs
        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
        tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
+       rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading
        progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
        progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback
        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -674,6 +691,7 @@ class llama_model_params(ctypes.Structure):
    split_mode: int
    main_gpu: int
    tensor_split: CtypesArray[ctypes.c_float]
+   rpc_servers: ctypes.c_char_p
    progress_callback: Callable[[float, ctypes.c_void_p], bool]
    progress_callback_user_data: ctypes.c_void_p
    kv_overrides: CtypesArray[llama_model_kv_override]
@@ -687,6 +705,7 @@ class llama_model_params(ctypes.Structure):
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),
         ("tensor_split", ctypes.POINTER(ctypes.c_float)),
+        ("rpc_servers", ctypes.c_char_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", ctypes.c_void_p),
         ("kv_overrides", ctypes.POINTER(llama_model_kv_override)),
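The three hunks above thread the new `rpc_servers` field through the `llama_model_params` docstring, the type annotations, and the ctypes `_fields_` table. A minimal sketch of how the field could be populated at the ctypes level is shown below, assuming the documented `llama_model_default_params()` helper from the bindings; the server addresses are placeholders, not real endpoints.

```python
# Hedged sketch: setting the new rpc_servers field on a llama_model_params
# struct. The addresses below are placeholders.
import llama_cpp

params = llama_cpp.llama_model_default_params()
# ctypes.c_char_p expects bytes; llama.cpp parses a comma separated list
params.rpc_servers = b"10.0.0.2:50052,10.0.0.3:50052"
```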
@@ -132,6 +132,7 @@ def create_app(
         middleware=middleware,
         title="🦙 llama.cpp Python API",
         version=llama_cpp.__version__,
+        root_path=server_settings.root_path,
     )
     app.add_middleware(
         CORSMiddleware,
@@ -274,6 +275,7 @@ async def create_completion(
         "best_of",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)

@@ -287,6 +289,15 @@ async def create_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.CreateCompletionResponse,
         Iterator[llama_cpp.CreateCompletionStreamResponse],
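The hunk above is where `min_tokens` takes effect: when a request asks for a minimum, a `MinTokensLogitsProcessor` is added to the sampling pipeline. Conceptually, such a processor just masks the end-of-sequence logit until enough completion tokens have been produced. The sketch below illustrates that idea only; the function and variable names are illustrative and this is not the library's internal implementation.

```python
# Conceptual sketch of a min-tokens logits processor: suppress EOS until at
# least `min_tokens` completion tokens have been generated.
import numpy as np

def make_min_tokens_processor(min_tokens: int, token_eos: int):
    prompt_len = None  # length of the prompt, captured on the first call

    def processor(input_ids: np.ndarray, scores: np.ndarray) -> np.ndarray:
        nonlocal prompt_len
        if prompt_len is None:
            prompt_len = len(input_ids)
        if len(input_ids) - prompt_len < min_tokens:
            scores[token_eos] = -np.inf  # EOS cannot be sampled yet
        return scores

    return processor
```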
@@ -444,6 +455,7 @@ async def create_chat_completion(
         "n",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -457,6 +469,15 @@ async def create_chat_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
@@ -140,6 +140,20 @@ class LlamaProxy:
             chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
                 clip_model_path=settings.clip_model_path, verbose=settings.verbose
             )
+        elif settings.chat_format == "llama-3-vision-alpha":
+            assert settings.clip_model_path is not None, "clip model not found"
+            if settings.hf_model_repo_id is not None:
+                chat_handler = (
+                    llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
+                        repo_id=settings.hf_model_repo_id,
+                        filename=settings.clip_model_path,
+                        verbose=settings.verbose,
+                    )
+                )
+            else:
+                chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
+                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
+                )
         elif settings.chat_format == "hf-autotokenizer":
             assert (
                 settings.hf_pretrained_model_name_or_path is not None
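The new `llama-3-vision-alpha` branch mirrors the existing multimodal handlers: it builds a `Llama3VisionAlpha` chat handler from a CLIP/projector model and hands it to the `Llama` instance. A hedged usage sketch of the same handler used directly from Python follows; the model paths and image URL are placeholders, and the message layout follows the multimodal format used by the other LLaVA-style handlers.

```python
# Hedged sketch: using the llama-3-vision-alpha chat handler directly.
# The .gguf paths and image URL are placeholders.
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llama3VisionAlpha

chat_handler = Llama3VisionAlpha(clip_model_path="./llama-3-vision-alpha-mmproj.gguf")
llm = Llama(
    model_path="./llama-3-vision-alpha.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embedding tokens
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
)
```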
@@ -228,6 +242,7 @@ class LlamaProxy:
             logits_all=settings.logits_all,
             embedding=settings.embedding,
             offload_kqv=settings.offload_kqv,
+            flash_attn=settings.flash_attn,
             # Sampling Params
             last_n_tokens_size=settings.last_n_tokens_size,
             # LoRA Params
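This hunk forwards the existing `flash_attn` model setting into the `Llama` constructor, so the server can enable flash attention per model. A minimal sketch of the equivalent direct usage, with a placeholder model path:

```python
# Hedged sketch: enabling flash attention when constructing a Llama instance
# directly (placeholder model path).
from llama_cpp import Llama

llm = Llama(
    model_path="./model.gguf",
    n_gpu_layers=-1,  # flash attention typically only has an effect with GPU offload
    flash_attn=True,  # same switch the server now forwards from settings
)
```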
@@ -215,6 +215,10 @@ class ServerSettings(BaseSettings):
         default=False,
         description="Disable EventSource pings (may be needed for some clients).",
     )
+    root_path: str = Field(
+        default="",
+        description="The root path for the server. Useful when running behind a reverse proxy.",
+    )


 class Settings(ServerSettings, ModelSettings):
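Together with the `root_path=server_settings.root_path` change in `create_app` above, this setting lets the FastAPI app generate correct URLs when it is served under a path prefix. A hedged sketch of wiring it up programmatically is shown below; the prefix, model path, and port are assumptions.

```python
# Hedged sketch: mounting the server under /llm behind a reverse proxy.
# ServerSettings/ModelSettings/create_app are the documented entry points;
# the model path and port are placeholders.
import uvicorn
from llama_cpp.server.app import create_app
from llama_cpp.server.settings import ModelSettings, ServerSettings

app = create_app(
    server_settings=ServerSettings(root_path="/llm"),
    model_settings=[ModelSettings(model="./model.gguf")],
)
uvicorn.run(app, host="127.0.0.1", port=8000)
```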
@@ -16,10 +16,14 @@ max_tokens_field = Field(
     default=16, ge=1, description="The maximum number of tokens to generate."
 )

+min_tokens_field = Field(
+    default=0,
+    ge=0,
+    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
+)
+
 temperature_field = Field(
     default=0.8,
-    ge=0.0,
-    le=2.0,
     description="Adjust the randomness of the generated text.\n\n"
     + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
 )
@@ -113,6 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -208,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"
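With `min_tokens` now part of both request models, a client can request a floor on completion length through the OpenAI-compatible endpoints. A hedged client-side sketch follows; the server URL, prompt, and values are assumptions.

```python
# Hedged sketch: passing the new min_tokens field to a locally running
# llama-cpp-python server. URL, prompt, and values are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Summarize what RPC offloading is."}],
        "max_tokens": 128,
        "min_tokens": 32,  # EOS is suppressed until at least 32 tokens are generated
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```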
@@ -44,7 +44,9 @@ releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)
 # For each release, get all assets
 for release in $releases; do
     assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
-    echo "    <h2>$release</h2>" >> index.html
+    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
+    release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
+    echo "    <h2>$release_version</h2>" >> index.html
     for asset in $(echo $assets | jq -r .[].browser_download_url); do
         if [[ $asset == *".whl" ]]; then
             echo "    <a href=\"$asset\">$asset</a>" >> index.html
2 vendor/llama.cpp vendored
@@ -1 +1 @@
-Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961
+Subproject commit 05834841dcb4f922983ea976539c70472272df9a