Compare commits


No commits in common. "5b4ad6f4d17180c40f62bb88a6c998446f9ac79c" and "1d177aaaefb2843d50b0b2a58dee84ace7eab054" have entirely different histories.

20 changed files with 193 additions and 807 deletions

View file

@@ -8,12 +8,8 @@ updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "daily"
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "daily"
interval: "weekly"

View file

@@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
uses: pypa/cibuildwheel@v2.18.0
uses: pypa/cibuildwheel@v2.17.0
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64
- name: Build wheels
uses: pypa/cibuildwheel@v2.18.0
uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""

View file

@@ -7,61 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.75]
- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
- misc: Remove unnecessary metadata lookups by @CISC in #1448
## [0.2.74]
- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
- docs: Fix typo in README.md by @yupbank in #1444
## [0.2.73]
- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
## [0.2.72]
- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
## [0.2.71]
- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
## [0.2.70]
- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
- feat: fill-in-middle support by @CISC in #1386
- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
- docs: update README.md @eltociear in #1432
- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
## [0.2.69]
- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
- fix: UTF-8 handling with grammars by @jsoma in #1415
## [0.2.68]
- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413

View file

@@ -51,9 +51,8 @@ if (LLAMA_BUILD)
)
if (LLAVA_BUILD)
if (LLAMA_CUBLAS OR LLAMA_CUDA)
if (LLAMA_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
endif()
if (LLAMA_METAL)

View file

@@ -16,7 +16,7 @@ build.debug:
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
build.cuda:
CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
build.opencl:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .

View file

@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
)
llm.create_chat_completion(
messages = [
@@ -547,10 +547,10 @@ llm = Llama.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*text-model*",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
)
response = llm.create_chat_completion(
respoonse = llm.create_chat_completion(
messages = [
{
"role": "user",

View file

@@ -1,19 +0,0 @@
This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
First, install the requirements:
```bash
$ pip install -r requirements.txt
```
Deploy a GGUF model to Ray Serve with the following command:
```bash
$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
```
This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
```bash
$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
```
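The same request can also be made from Python; a minimal client sketch, assuming the Ray Serve deployment above is listening on `http://localhost:8000/`:
```python
# Minimal Python client for the Ray Serve endpoint described above.
# Assumes the deployment started with `serve run llm:llm_builder ...` is running locally.
import json
import urllib.request

payload = json.dumps({"prompt": "tell me a joke", "max_tokens": 128}).encode("utf-8")
req = urllib.request.Request("http://localhost:8000/", data=payload, method="POST")
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))
```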

View file

@@ -1,20 +0,0 @@
from starlette.requests import Request
from typing import Dict
from ray import serve
from ray.serve import Application
from llama_cpp import Llama
@serve.deployment
class LlamaDeployment:
def __init__(self, model_path: str):
self._llm = Llama(model_path=model_path)
async def __call__(self, http_request: Request) -> Dict:
input_json = await http_request.json()
prompt = input_json["prompt"]
max_tokens = input_json.get("max_tokens", 64)
return self._llm(prompt, max_tokens=max_tokens)
def llm_builder(args: Dict[str, str]) -> Application:
return LlamaDeployment.bind(args["model_path"])

View file

@@ -1,3 +0,0 @@
ray[serve]
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
llama-cpp-python

View file

@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.75"
__version__ = "0.2.68"

View file

@@ -203,7 +203,7 @@ class _LlamaModel:
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
)
# Extra
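For reference, a small standalone sketch of the two behaviours (the BOS id and byte strings are made up): the newer guard only strips the first byte when it really is a leading space.
```python
BOS = 1  # placeholder BOS token id

def strip_old(tokens, output):  # 0.2.68 behaviour: always drop the first byte after BOS
    return output[1:] if len(tokens) > 0 and tokens[0] == BOS else output

def strip_new(tokens, output):  # 0.2.75 behaviour: drop it only when it is actually a space
    return output[1:] if len(tokens) > 0 and tokens[0] == BOS and output[0:1] == b" " else output

print(strip_old([BOS, 42], b"Hello"))  # b'ello' -- first byte lost even though it was not a space
print(strip_new([BOS, 42], b"Hello"))  # b'Hello'
```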

View file

@@ -262,12 +262,7 @@ class Llama:
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
# copy min(v_bytes, 128) to str_value
ctypes.memmove(
self._kv_overrides_array[i].value.str_value,
v_bytes,
min(len(v_bytes), 128),
)
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
else:
raise ValueError(f"Unknown value type for {k}: {v}")
@@ -378,7 +373,6 @@ class Llama:
self.chat_format = chat_format
self.chat_handler = chat_handler
self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}
self.draft_model = draft_model
@@ -410,33 +404,10 @@ class Llama:
if self.verbose:
print(f"Model metadata: {self.metadata}", file=sys.stderr)
eos_token_id = self.token_eos()
bos_token_id = self.token_bos()
eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
# Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
if "tokenizer.chat_template" in self.metadata:
template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
if self.verbose and template_choices:
print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
for name, template in template_choices.items():
self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
template=template,
eos_token=eos_token,
bos_token=bos_token,
stop_token_ids=[eos_token_id],
).to_chat_handler()
if (
self.chat_format is None
and self.chat_handler is None
and "chat_template.default" in template_choices
and "tokenizer.chat_template" in self.metadata
):
chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
self.metadata
@@ -447,17 +418,35 @@ class Llama:
if self.verbose:
print(f"Guessed chat format: {chat_format}", file=sys.stderr)
else:
template = self.metadata["tokenizer.chat_template"]
try:
eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
except:
eos_token_id = self.token_eos()
try:
bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
except:
bos_token_id = self.token_bos()
eos_token = self._model.token_get_text(eos_token_id)
bos_token = self._model.token_get_text(bos_token_id)
if self.verbose:
print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
print(f"Using gguf chat template: {template}", file=sys.stderr)
print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
self.chat_format = "chat_template.default"
self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
template=template,
eos_token=eos_token,
bos_token=bos_token,
stop_token_ids=[eos_token_id],
).to_chat_handler()
if self.chat_format is None and self.chat_handler is None:
self.chat_format = "llama-2"
if self.verbose:
print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
@property
def ctx(self) -> llama_cpp.llama_context_p:
@@ -961,54 +950,19 @@ class Llama:
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
prefix_token_id: int = self._model.token_prefix()
middle_token_id: int = self._model.token_middle()
suffix_token_id: int = self._model.token_suffix()
# If prompt is empty, initialize completion with BOS token to avoid
# detokenization including a space at the beginning of the completion
completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
# Add blank space to start of prompt to match OG llama tokenizer
prompt_tokens: List[int] = (
(
[prefix_token_id]
if prefix_token_id >= 0 and suffix is not None
else []
)
+
(
(
self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
self.tokenize(prompt.encode("utf-8"), special=True)
if prompt != ""
else (
[]
if prefix_token_id >= 0 and suffix is not None
else [self.token_bos()]
)
)
if isinstance(prompt, str)
else prompt
)
+
(
(
[suffix_token_id]
+
(
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
if suffix
else []
)
)
if suffix_token_id >= 0 and suffix is not None
else []
)
+
(
[middle_token_id]
if middle_token_id >= 0 and suffix is not None
else []
)
)
text: bytes = b""
returned_tokens: int = 0
stop = (
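The branchy expression above reduces to the following fill-in-middle layout when the model defines the special tokens and a suffix is supplied; a rough sketch with placeholder token ids:
```python
def fim_prompt(prefix_ids, suffix_ids, pre=32007, suf=32008, mid=32009):
    # prefix marker + prompt tokens, then suffix marker + suffix tokens,
    # then the middle marker where generation continues (ids are placeholders)
    return [pre] + prefix_ids + [suf] + suffix_ids + [mid]

print(fim_prompt([10, 11, 12], [20, 21]))
# [32007, 10, 11, 12, 32008, 20, 21, 32009]
```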
@@ -1387,7 +1341,7 @@ class Llama:
if echo:
text_str = prompt + text_str
if suffix_token_id < 0 and suffix is not None:
if suffix is not None:
text_str = text_str + suffix
logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1725,7 +1679,7 @@ class Llama:
Returns:
Generated chat completion or a stream of chat completion chunks.
"""
handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
self.chat_format
)
return handler(
@@ -2084,19 +2038,3 @@ class StoppingCriteriaList(List[StoppingCriteria]):
self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
) -> bool:
return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
class MinTokensLogitsProcessor(LogitsProcessor):
def __init__(self, min_tokens: int, token_eos: int):
self.min_tokens = min_tokens
self.token_eos = token_eos
self.prompt_tokens = None
def __call__(
self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
) -> npt.NDArray[np.single]:
if self.prompt_tokens is None:
self.prompt_tokens = len(input_ids)
if len(input_ids) - self.prompt_tokens < self.min_tokens:
scores[self.token_eos] = -np.inf
return scores
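For context, a hedged sketch of how this processor is attached, mirroring the server wiring removed further below (the model path is a placeholder):
```python
from llama_cpp import Llama, LogitsProcessorList, MinTokensLogitsProcessor

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# Suppress EOS until at least 16 completion tokens have been generated.
min_tokens = LogitsProcessorList([MinTokensLogitsProcessor(16, llm.token_eos())])
out = llm("Q: Name the planets. A:", max_tokens=64, logits_processor=min_tokens)
```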

View file

@@ -11,7 +11,6 @@ from contextlib import ExitStack
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast
import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
import numpy as np
import numpy.typing as npt
@@ -192,7 +191,7 @@ class Jinja2ChatFormatter(ChatFormatter):
self.add_generation_prompt = add_generation_prompt
self.stop_token_ids = set(stop_token_ids) if stop_token_ids is not None else None
self._environment = ImmutableSandboxedEnvironment(
self._environment = jinja2.Environment(
loader=jinja2.BaseLoader(),
trim_blocks=True,
lstrip_blocks=True,
@@ -685,7 +684,8 @@ def hf_tokenizer_config_to_chat_formatter(
assert isinstance(tokenizer_config["eos_token"], str)
eos_token = tokenizer_config["eos_token"]
env = ImmutableSandboxedEnvironment(
env = jinja2.Environment(
loader=jinja2.BaseLoader(),
trim_blocks=True,
lstrip_blocks=True,
).from_string(chat_template)
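This sandbox switch is the SSTI hardening referenced in the 0.2.72 changelog entries above; a brief illustration of the difference (the template string is illustrative only):
```python
import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment, SecurityError

# An SSTI-style template expression: reach str through a dunder attribute and call it.
template_str = "{{ ''.__class__('pwned') }}"

print(jinja2.Environment().from_string(template_str).render())  # -> pwned

try:
    ImmutableSandboxedEnvironment().from_string(template_str).render()
except SecurityError as exc:
    print("sandbox blocked it:", exc)
```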
@@ -1894,8 +1894,6 @@ def functionary_v1_v2_chat_handler(
function_call = (
tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
)
elif function_call is not None:
pass
else:
function_call = "auto"
@@ -1932,10 +1930,11 @@ def functionary_v1_v2_chat_handler(
logits_processor=logits_processor,
grammar=grammar,
)
if stream is False:
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
assert stream is False # TODO: support stream mode
def get_grammar(function_call):
function_body = None
for function in functions or []:
@@ -1969,7 +1968,7 @@ def functionary_v1_v2_chat_handler(
return grammar
def create_completion(prompt, stop, grammar):
def create_completion(stop):
completion = cast(llama_types.Completion, llama.create_completion(
prompt=prompt,
temperature=temperature,
@@ -1977,7 +1976,7 @@ def functionary_v1_v2_chat_handler(
top_k=top_k,
min_p=min_p,
typical_p=typical_p,
stream=stream,
stream=False,
stop=stop,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
@@ -1998,315 +1997,6 @@ def functionary_v1_v2_chat_handler(
function_calls, function_bodies = [], []
completion_tokens = 0
def generate_streaming(tools, functions, function_call, prompt):
assert version == "v2", "Streaming for v1 is not supported"
chunk_id, chunk_created = None, None
# If tool_choice/function_call is provided
if isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
grammar = get_grammar(function_call["name"])
stops = [STOP_TOKEN, FROM_TOKEN]
tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text = ""
first = True
for chunk in completion:
# Yield the tool/function name first
if first:
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": 0,
"id": "call_" + tool_id,
"type": "function",
"function": {"name": function_call["name"], "arguments": ""},
}
]
}
else:
func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
],
)
first = False
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": 0,
"id": "call_" + tool_id,
"type": "function",
"function": {
"name": None,
"arguments": chunk["choices"][0]["text"].rstrip(),
},
}
]
}
else:
func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
if len(chunk["choices"][0]["text"].rstrip()) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": None,
"content": None,
**func_call_dict,
},
}
],
)
# Yield tool_call/function_call stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{
"index": 0,
"finish_reason": "tool_calls" if tools is not None else "function_call",
"logprobs": None,
"delta": {
"role": None, "content": None, "function_call": None, "tool_calls": None
},
}
],
)
# If "auto" or no tool_choice/function_call
elif isinstance(function_call, str) and function_call == "auto":
tool_index = 0
while True:
# Generate function name first
grammar = None
stops = CONTENT_TOKEN
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text = ""
for chunk in completion:
completion_text += chunk["choices"][0]["text"]
if chunk_id is None:
chunk_id = chunk["id"]
if chunk_created is None:
chunk_created = chunk["created"]
function_name = completion_text.strip()
if function_name == "all":
prompt += "all\n<|content|>"
# Yield the first empty message for content
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
model=chunk["model"],
created=chunk_created,
object="chat.completion.chunk",
choices=[
{
"index": 0,
"delta": {"role": "assistant", "content": ""},
"logprobs": None,
"finish_reason": None,
}
],
)
else:
prompt += f"{function_name}\n<|content|>"
grammar = get_grammar(function_name)
tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": tool_index,
"id": "call_" + tool_id,
"type": "function",
"function": {"name": function_name, "arguments": ""},
}
]
}
else:
func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
# Stream function name
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant",
"content": None,
**func_call_dict,
},
}
],
)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
if function_name == "all":
completion_text = ""
stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
for i, chunk in enumerate(completion):
completion_text += chunk["choices"][0]["text"]
if is_end:
buffer.append(chunk["choices"][0]["text"].strip(" "))
if stop_sequence.startswith("".join(buffer)):
continue
else:
buffer.pop()
while len(buffer) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant", "content": buffer.pop(0)
},
}
],
)
is_end = False
elif chunk["choices"][0]["text"] == "\n":
is_end = True
buffer.append(chunk["choices"][0]["text"].strip(" "))
continue
if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant",
"content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
},
}
],
)
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
if completion_text.endswith("\n<|from|>assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
elif completion_text.endswith("\n<|from|> assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
else:
cleaned_completion_text = completion_text.strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else:
# Yield stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
model=chunk["model"],
created=chunk_created,
object="chat.completion.chunk",
choices=[
{
"index": 0,
"delta": {},
"logprobs": None,
"finish_reason": "stop",
}
],
)
break
else:
# Check whether the model wants to generate another turn
completion_text = ""
for chunk in completion:
completion_text += chunk["choices"][0]["text"]
if len(chunk["choices"][0]["text"].rstrip()) > 0:
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": tool_index,
"id": "call_" + tool_id,
"type": "function",
"function": {
"name": None,
"arguments": chunk["choices"][0]["text"].rstrip(),
},
}
]
}
else:
func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": None,
"content": None,
**func_call_dict,
},
}
],
)
prompt += completion_text.strip()
grammar = None
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
prompt += "\n<|from|>assistant\n<|recipient|>"
tool_index += 1
else:
# Yield tool_call/function_call stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"finish_reason": "tool_calls" if tools is not None else "function_call",
"logprobs": None,
"delta": {
"role": None, "content": None, "function_call": None, "tool_calls": None
},
}
],
)
break
if stream is not False:
return generate_streaming(
tools=tools, functions=functions, function_call=function_call, prompt=prompt
)
else:
if version == "v1":
# If no or "auto" tool_choice/function_call
if isinstance(function_call, str) and function_call == "auto":
@@ -2322,7 +2012,7 @@ def functionary_v1_v2_chat_handler(
prompt = prompt
stops = ["\n", END_ASSISTANT_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
@@ -2349,7 +2039,7 @@ def functionary_v1_v2_chat_handler(
completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
)
grammar = get_grammar(function_calls[-1])
completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion["choices"][0]["text"].strip())
# If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2363,7 +2053,7 @@ def functionary_v1_v2_chat_handler(
function_calls.append(function_call)
grammar = get_grammar(function_call)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion_text.strip())
@@ -2373,7 +2063,7 @@ def functionary_v1_v2_chat_handler(
# Generate function name first
grammar = None
stops = CONTENT_TOKEN
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_name = completion_text.strip()
@@ -2386,7 +2076,7 @@ def functionary_v1_v2_chat_handler(
grammar = get_grammar(function_call)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
if function_name == "all":
@@ -2413,7 +2103,7 @@ def functionary_v1_v2_chat_handler(
# Check whether the model wants to generate another turn
prompt += completion_text.strip()
grammar = None
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_tokens += completion["usage"]["completion_tokens"]
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
@@ -2475,7 +2165,7 @@ def functionary_v1_v2_chat_handler(
class Llava15ChatHandler:
DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
CHAT_FORMAT = (
"{% for message in messages %}"
@@ -2598,31 +2288,18 @@ class Llava15ChatHandler:
assert self.clip_ctx is not None
system_prompt = _get_system_message(messages)
if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
if system_prompt == "":
messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
image_urls = self.get_image_urls(messages)
template = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
).from_string(self.CHAT_FORMAT)
text = template.render(
messages=messages,
add_generation_prompt=True,
eos_token=llama.detokenize([llama.token_eos()]),
bos_token=llama.detokenize([llama.token_bos()]),
)
template = jinja2.Template(self.CHAT_FORMAT)
text = template.render(messages=messages, add_generation_prompt=True)
split_text = self.split_text_on_image_urls(text, image_urls)
def embed_image_bytes(image_bytes: bytes):
if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
return self._last_image_embed
with suppress_stdout_stderr(disable=self.verbose):
# Free the previous image embed
if self._last_image_embed is not None:
self._llava_cpp.llava_image_embed_free(self._last_image_embed)
self._last_image_embed = None
self._last_image_hash = None
embed = (
self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
@@ -2637,10 +2314,9 @@ class Llava15ChatHandler:
# Evaluate prompt
llama.reset()
llama._ctx.kv_cache_clear()
for type_, value in split_text:
for i, (type_, value) in enumerate(split_text):
if type_ == "text":
tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
if llama.n_tokens + len(tokens) > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
llama.eval(tokens)
@@ -2658,8 +2334,6 @@ class Llava15ChatHandler:
llama.n_batch,
n_past_p,
)
# Required to avoid issues with hf tokenizer
llama.input_ids[llama.n_tokens : n_past.value] = -1
llama.n_tokens = n_past.value
# Get prompt tokens to avoid a cache miss
@@ -3049,7 +2723,6 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
# Answer the question<|im_end|><|im_start|>user
# <image>
# What is the picture about?<|im_end|><|im_start|>assistant
DEFAULT_SYSTEM_MESSAGE = "Answer the question"
CHAT_FORMAT = (
"{% for message in messages %}"
@@ -3098,66 +2771,6 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
"{% endif %}"
)
class Llama3VisionAlpha(Llava15ChatHandler):
# question = "<image>" + q
# prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
DEFAULT_SYSTEM_MESSAGE = None
CHAT_FORMAT = (
"{% for message in messages %}"
"<|start_header_id|>"
"{% if message.role == 'user' %}"
"user<|end_header_id|>\n\n"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"assistant<|end_header_id|>\n\n"
"{{ message.content }}"
"{% endif %}"
"<|eot_id|>"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
"{% endif %}"
)
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
@@ -3245,7 +2858,8 @@ def chatml_function_calling(
"{% endfor %}"
"{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)
template_renderer = ImmutableSandboxedEnvironment(
template_renderer = jinja2.Environment(
loader=jinja2.BaseLoader(),
autoescape=jinja2.select_autoescape(["html", "xml"]),
undefined=jinja2.StrictUndefined,
).from_string(function_calling_template)

View file

@@ -294,11 +294,6 @@ LLAMA_VOCAB_TYPE_WPM = 3
# LLAMA_VOCAB_PRE_TYPE_MPT = 5,
# LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -308,11 +303,6 @@ LLAMA_VOCAB_PRE_TYPE_FALCON = 4
LLAMA_VOCAB_PRE_TYPE_MPT = 5
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
LLAMA_VOCAB_PRE_TYPE_REFACT = 8
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
LLAMA_VOCAB_PRE_TYPE_OLMO = 11
LLAMA_VOCAB_PRE_TYPE_DBRX = 12
# // note: these values should be synchronized with ggml_rope
@@ -381,7 +371,6 @@ LLAMA_TOKEN_TYPE_BYTE = 6
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
@@ -414,8 +403,6 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
LLAMA_FTYPE_MOSTLY_IQ2_S = 28
LLAMA_FTYPE_MOSTLY_IQ2_M = 29
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
LLAMA_FTYPE_MOSTLY_IQ1_M = 31
LLAMA_FTYPE_MOSTLY_BF16 = 32
LLAMA_FTYPE_GUESSED = 1024
# enum llama_rope_scaling_type {
@@ -507,7 +494,7 @@ class llama_token_data_array(ctypes.Structure):
llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)
# typedef bool (*llama_progress_callback)(float progress, void * user_data);
# typedef bool (*llama_progress_callback)(float progress, void *ctx);
llama_progress_callback = ctypes.CFUNCTYPE(
ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
)
@@ -648,9 +635,6 @@ class llama_model_kv_override(ctypes.Structure):
# // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
# const float * tensor_split;
# // comma separated list of RPC servers to use for offloading
# const char * rpc_servers;
# // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
# // If the provided progress_callback returns true, model loading continues.
# // If it returns false, model loading is immediately aborted.
@@ -677,7 +661,6 @@ class llama_model_params(ctypes.Structure):
split_mode (int): how to split the model across multiple GPUs
main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading
progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -691,7 +674,6 @@ class llama_model_params(ctypes.Structure):
split_mode: int
main_gpu: int
tensor_split: CtypesArray[ctypes.c_float]
rpc_servers: ctypes.c_char_p
progress_callback: Callable[[float, ctypes.c_void_p], bool]
progress_callback_user_data: ctypes.c_void_p
kv_overrides: CtypesArray[llama_model_kv_override]
@@ -705,7 +687,6 @@ class llama_model_params(ctypes.Structure):
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
("tensor_split", ctypes.POINTER(ctypes.c_float)),
("rpc_servers", ctypes.c_char_p),
("progress_callback", llama_progress_callback),
("progress_callback_user_data", ctypes.c_void_p),
("kv_overrides", ctypes.POINTER(llama_model_kv_override)),

View file

@@ -132,7 +132,6 @@ def create_app(
middleware=middleware,
title="🦙 llama.cpp Python API",
version=llama_cpp.__version__,
root_path=server_settings.root_path,
)
app.add_middleware(
CORSMiddleware,
@@ -275,7 +274,6 @@ async def create_completion(
"best_of",
"logit_bias_type",
"user",
"min_tokens",
}
kwargs = body.model_dump(exclude=exclude)
@@ -289,15 +287,6 @@ async def create_completion(
if body.grammar is not None:
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
if body.min_tokens > 0:
_min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
[llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
)
if "logits_processor" not in kwargs:
kwargs["logits_processor"] = _min_tokens_logits_processor
else:
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
iterator_or_completion: Union[
llama_cpp.CreateCompletionResponse,
Iterator[llama_cpp.CreateCompletionStreamResponse],
@@ -455,7 +444,6 @@ async def create_chat_completion(
"n",
"logit_bias_type",
"user",
"min_tokens",
}
kwargs = body.model_dump(exclude=exclude)
llama = llama_proxy(body.model)
@@ -469,15 +457,6 @@ async def create_chat_completion(
if body.grammar is not None:
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
if body.min_tokens > 0:
_min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
[llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
)
if "logits_processor" not in kwargs:
kwargs["logits_processor"] = _min_tokens_logits_processor
else:
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
iterator_or_completion: Union[
llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
] = await run_in_threadpool(llama.create_chat_completion, **kwargs)

View file

@@ -140,20 +140,6 @@ class LlamaProxy:
chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "llama-3-vision-alpha":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "hf-autotokenizer":
assert (
settings.hf_pretrained_model_name_or_path is not None
@@ -242,7 +228,6 @@ class LlamaProxy:
logits_all=settings.logits_all,
embedding=settings.embedding,
offload_kqv=settings.offload_kqv,
flash_attn=settings.flash_attn,
# Sampling Params
last_n_tokens_size=settings.last_n_tokens_size,
# LoRA Params

View file

@@ -215,10 +215,6 @@ class ServerSettings(BaseSettings):
default=False,
description="Disable EventSource pings (may be needed for some clients).",
)
root_path: str = Field(
default="",
description="The root path for the server. Useful when running behind a reverse proxy.",
)
class Settings(ServerSettings, ModelSettings):

View file

@@ -16,14 +16,10 @@ max_tokens_field = Field(
default=16, ge=1, description="The maximum number of tokens to generate."
)
min_tokens_field = Field(
default=0,
ge=0,
description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
)
temperature_field = Field(
default=0.8,
ge=0.0,
le=2.0,
description="Adjust the randomness of the generated text.\n\n"
+ "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)
@@ -117,7 +113,6 @@ class CreateCompletionRequest(BaseModel):
max_tokens: Optional[int] = Field(
default=16, ge=0, description="The maximum number of tokens to generate."
)
min_tokens: int = min_tokens_field
temperature: float = temperature_field
top_p: float = top_p_field
min_p: float = min_p_field
@@ -213,7 +208,6 @@ class CreateChatCompletionRequest(BaseModel):
default=None,
description="The maximum number of tokens to generate. Defaults to inf",
)
min_tokens: int = min_tokens_field
logprobs: Optional[bool] = Field(
default=False,
description="Whether to output the logprobs or not. Default is True"

View file

@@ -44,9 +44,7 @@ releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)
# For each release, get all assets
for release in $releases; do
assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
# Get release version from release ie v0.1.0-cu121 -> v0.1.0
release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
echo " <h2>$release_version</h2>" >> index.html
echo " <h2>$release</h2>" >> index.html
for asset in $(echo $assets | jq -r .[].browser_download_url); do
if [[ $asset == *".whl" ]]; then
echo " <a href=\"$asset\">$asset</a>" >> index.html

vendor/llama.cpp (vendored submodule, 2 changes)

@@ -1 +1 @@
Subproject commit 05834841dcb4f922983ea976539c70472272df9a
Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961