Compare commits

...

13 commits

Author SHA1 Message Date
d3afd4507f
Merge https://github.com/abetlen/llama-cpp-python 2024-03-23 15:47:34 +05:30
Andrei Betlen
c1325dcdfb fix: tool_call missing first token. 2024-03-22 23:44:04 -04:00
Andrei Betlen
e325a831f0 feat: Update llama.cpp 2024-03-22 23:43:29 -04:00
Andrei Betlen
c89be28ef9 feat: Update llama.cpp 2024-03-20 20:50:47 -04:00
Andrei Betlen
3db03b7302 feat: Update llama.cpp 2024-03-20 13:27:43 -04:00
bretello
740f3f3812
fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 (#1289) 2024-03-20 12:46:09 -04:00
Andrei Betlen
f7decc9562 docs: Add chat examples to openapi ui 2024-03-19 10:52:53 -04:00
Andrei
60d8498f21
feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats (#1273)
* Add tools/functions variables to Jinja2ChatFormatter

Also fixed missing tools/tool_choices parameters in chat_formatter_to_chat_completion_handler().

* Set grammar when doing explicit function calling

* Add function / tool response for all chat formats

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2024-03-19 04:55:57 -04:00
Andrei Betlen
18d7ce918f feat: Update llama.cpp 2024-03-19 04:40:24 -04:00
Andrei Betlen
7d4a5ec59f Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main 2024-03-18 11:37:33 -04:00
Andrei Betlen
bf64752535 chore: Bump version 2024-03-18 11:37:30 -04:00
Jeffrey Fong
8a60c7bc8c
fix: Fix and optimize functionary chat handler (#1282)
* fix functionary chat logic

* further fixes

---------

Co-authored-by: Andrei <abetlen@gmail.com>
2024-03-18 10:40:57 -04:00
Andrei Betlen
8d298b4750 feat: Update llama.cpp 2024-03-18 10:26:36 -04:00
7 changed files with 504 additions and 268 deletions

View file

@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.57]
- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
## [0.2.56]
- feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e

View file

@ -17,6 +17,11 @@ if (LLAMA_BUILD)
set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
endif()
if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
endif()
add_subdirectory(vendor/llama.cpp)
install(
TARGETS llama

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.56"
__version__ = "0.2.57"

View file

@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter):
self,
*,
messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
**kwargs: Any,
) -> ChatFormatterResponse:
def raise_exception(message: str):
@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter):
bos_token=self.bos_token,
raise_exception=raise_exception,
add_generation_prompt=self.add_generation_prompt,
functions=functions,
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
)
return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
@ -288,6 +296,183 @@ def _convert_completion_to_chat(
return _convert_text_completion_to_chat(completion)
def _convert_completion_to_chat_function(
    tool_name: str,
    completion_or_chunks: Union[
        llama_types.CreateCompletionResponse,
        Iterator[llama_types.CreateCompletionStreamResponse],
    ],
    stream: bool,
):
    """Wrap a text completion as a chat completion that calls ``tool_name``.

    The completion text is used verbatim as the call's ``arguments``. It is
    reported twice: in the legacy ``function_call`` field and as the single
    entry of ``tool_calls``.

    Args:
        tool_name: Name of the function/tool the response is attributed to.
        completion_or_chunks: A finished completion when ``stream`` is false,
            otherwise an iterator of completion stream chunks.
        stream: Selects which of the two shapes ``completion_or_chunks`` has.

    Returns:
        A chat completion dict when ``stream`` is false. Otherwise a generator
        of chat completion chunks: one role-only chunk, one argument chunk per
        input chunk, and a closing chunk with ``finish_reason="tool_calls"``.
        An empty input stream yields nothing at all.

    Raises:
        AssertionError: If a non-streamed completion has no ``"usage"`` key.
    """

    def _call(arguments: str) -> dict:
        # A fresh dict per use so the legacy and tool_calls views never alias.
        return {"name": tool_name, "arguments": arguments}

    def _chunk(id_, created, model, delta, finish_reason=None) -> dict:
        # Common envelope shared by every streamed chat completion chunk.
        return {
            "id": id_,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "finish_reason": finish_reason,
                    "logprobs": None,
                    "delta": delta,
                }
            ],
        }

    if not stream:
        completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
        assert "usage" in completion
        tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
        arguments = completion["choices"][0]["text"]
        # TODO: Fix for legacy function calls
        chat_completion: llama_types.CreateChatCompletionResponse = {
            "id": "chat" + completion["id"],
            "object": "chat.completion",
            "created": completion["created"],
            "model": completion["model"],
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": None,
                        "function_call": _call(arguments),
                        "tool_calls": [
                            {
                                "id": tool_id,
                                "type": "function",
                                "function": _call(arguments),
                            }
                        ],
                    },
                    "finish_reason": "tool_calls",
                }
            ],
            "usage": completion["usage"],
        }
        return chat_completion

    chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore

    def _stream_response_to_function_stream(
        chunks: Iterator[llama_types.CreateCompletionStreamResponse],
    ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
        id_ = None
        created = None
        model = None
        tool_id = None
        for chunk in chunks:
            if tool_id is None:
                # First chunk: remember the stream-wide identifiers and open
                # the stream with a role-only delta.
                id_ = "chat" + chunk["id"]
                created = chunk["created"]
                model = chunk["model"]
                tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
                yield _chunk(
                    id_,
                    created,
                    model,
                    {
                        "role": "assistant",
                        "content": None,
                        "function_call": None,
                        "tool_calls": None,
                    },
                )
            # Every input chunk, including the first, forwards its text as an
            # argument fragment so no token is dropped.
            text = chunk["choices"][0]["text"]
            yield _chunk(
                "chat" + chunk["id"],
                chunk["created"],
                chunk["model"],
                {
                    "role": None,
                    "content": None,
                    "function_call": _call(text),
                    "tool_calls": [
                        {
                            "index": 0,
                            "id": tool_id,
                            "type": "function",
                            "function": _call(text),
                        }
                    ],
                },
            )

        # Close the stream only if at least one chunk was seen.
        if id_ is not None and created is not None and model is not None:
            yield _chunk(
                id_,
                created,
                model,
                {
                    "role": None,
                    "content": None,
                    "function_call": None,
                    "tool_calls": None,
                },
                finish_reason="tool_calls",
            )

    return _stream_response_to_function_stream(chunks)
def chat_formatter_to_chat_completion_handler(
chat_formatter: ChatFormatter,
) -> LlamaChatCompletionHandler:
@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler(
messages=messages,
functions=functions,
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
)
prompt = result.prompt
if result.stop is not None:
@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler(
if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)
# Convert legacy functions to tools
if functions is not None:
tools = [
{
"type": "function",
"function": function,
}
for function in functions
]
# Convert legacy function_call to tool_choice
if function_call is not None:
if isinstance(function_call, str) and (
function_call == "none" or function_call == "auto"
):
tool_choice = function_call
if isinstance(function_call, dict) and "name" in function_call:
tool_choice = {
"type": "function",
"function": {
"name": function_call["name"],
},
}
tool = None
if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
name = tool_choice["function"]["name"]
tool = next((t for t in tools if t["function"]["name"] == name), None)
if tool is None:
raise ValueError(f"Tool choice '{name}' not found in tools.")
schema = tool["function"]["parameters"]
try:
# create grammar from json schema
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(schema), verbose=llama.verbose
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)
completion_or_chunks = llama.create_completion(
prompt=prompt,
temperature=temperature,
@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler(
grammar=grammar,
logit_bias=logit_bias,
)
if tool is not None:
tool_name = tool["function"]["name"]
return _convert_completion_to_chat_function(
tool_name, completion_or_chunks, stream
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream)
return chat_completion_handler
@ -1596,13 +1829,15 @@ def functionary_v1_v2_chat_handler(
function_call = (
tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
)
else:
function_call = "auto"
prompt = prepare_messages_for_inference(
messages, tokenizer, version, functions, tools
)
# If no tools/functions are provided
if function_call is None and (functions is None or len(functions) == 0):
if function_call == "none" or functions is None or len(functions) == 0:
if version == "v1":
stop = END_ASSISTANT_TOKEN
else:
@ -1630,6 +1865,7 @@ def functionary_v1_v2_chat_handler(
logits_processor=logits_processor,
grammar=grammar,
)
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
assert stream is False # TODO: support stream mode
@ -1692,13 +1928,12 @@ def functionary_v1_v2_chat_handler(
return completion
content = ""
function_calls, function_bodies = [], []
if version == "v1":
# If no or "auto" tool_choice/function_call
if function_call is None or (
isinstance(function_call, str) and function_call == "auto"
):
if isinstance(function_call, str) and function_call == "auto":
stops = ["\n", END_ASSISTANT_TOKEN]
# If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none":
@ -1747,70 +1982,67 @@ def functionary_v1_v2_chat_handler(
else:
function_bodies.append(completion_text.strip())
else:
# Loop until all parallel function calls are generated
while True:
# If no or "auto" tool_choice/function_call
if function_call is None or (
isinstance(function_call, str) and function_call == "auto"
):
grammar = None
stops = CONTENT_TOKEN
# If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none":
prompt = (
prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ "all\n<|content|>"
)
stops = STOP_TOKEN
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
stops = STOP_TOKEN
function_call = function_call["name"]
function_calls.append(function_call)
grammar = get_grammar(function_call)
else:
prompt = prompt
stops = STOP_TOKEN
# If tool_choice/function_call is "none"
if isinstance(function_call, str) and function_call == "none":
prompt = (
prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ "all\n<|content|>"
)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
function_call = function_call["name"]
function_calls.append(function_call)
grammar = get_grammar(function_call)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
# If the generation does not involve a function call
if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
"all"
):
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# Generate model response if the model decides not to call any function
elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
prompt += completion_text + CONTENT_TOKEN
completion = create_completion(stop=STOP_TOKEN)
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# Generate parameters if model decides to call a function
elif prompt.endswith(RECIPIENT_TOKEN):
function_calls.append(completion_text[:-1])
grammar = get_grammar(function_calls[-1])
completion = create_completion(stop=[STOP_TOKEN, "\n"])
function_bodies.append(completion["choices"][0]["text"].strip())
prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
function_bodies.append(completion_text.strip())
# If "auto" or no tool_choice/function_call
elif isinstance(function_call, str) and function_call == "auto":
while True:
# Generate function name first
grammar = None
# Try to generate the beginning of next turn
# If empty completion, break from loop
next_turn_completion_text = create_completion(
stop=[STOP_TOKEN, RECIPIENT_TOKEN]
)["choices"][0]["text"]
if len(next_turn_completion_text) > 0:
prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}"
stops = CONTENT_TOKEN
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
function_name = completion_text.strip()
if function_name == "all":
prompt += "all\n<|content|>"
else:
break
# Break from loop if tool_choice/function_call is provided as a dict
else:
function_bodies.append(completion_text.strip())
break
function_call = completion_text.strip()
prompt += f"{function_call}\n<|content|>"
function_calls.append(function_call)
grammar = get_grammar(function_call)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
if function_name == "all":
content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
content = content.lstrip()
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else:
break
else:
function_bodies.append(completion_text.strip())
# Check whether the model wants to generate another turn
prompt += completion_text.strip()
grammar = None
completion = create_completion(stop=stops)
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
else:
break
assert "usage" in completion
assert len(function_calls) > 0
assert len(function_calls) == len(function_bodies)
tool_calls = []
@ -1843,14 +2075,14 @@ def functionary_v1_v2_chat_handler(
"index": 0,
"message": {
"role": "assistant",
"content": None,
"content": None if content == "" else content,
"function_call": {
"name": tool_calls[0]["function"]["name"],
"arguments": tool_calls[0]["function"]["arguments"],
},
"tool_calls": tool_calls,
} if len(tool_calls) > 0 else None,
"tool_calls": tool_calls if len(tool_calls) > 0 else None,
},
"finish_reason": "tool_calls",
"finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
}
],
usage=completion["usage"],
@ -2199,181 +2431,6 @@ def chatml_function_calling(
stream=stream,
)
def _convert_completion_to_chat_function(
    tool_name: str,
    completion_or_chunks: Union[
        llama_types.CreateCompletionResponse,
        Iterator[llama_types.CreateCompletionStreamResponse],
    ],
    stream: bool,
):
    """Wrap a text completion as a chat completion that calls ``tool_name``.

    The completion text is used verbatim as the call's ``arguments``. It is
    reported twice: in the legacy ``function_call`` field and as the single
    entry of ``tool_calls``.

    Args:
        tool_name: Name of the function/tool the response is attributed to.
        completion_or_chunks: A finished completion when ``stream`` is false,
            otherwise an iterator of completion stream chunks.
        stream: Selects which of the two shapes ``completion_or_chunks`` has.

    Returns:
        A chat completion dict when ``stream`` is false. Otherwise a generator
        of chat completion chunks: one role-only chunk, one argument chunk per
        input chunk, and a closing chunk with ``finish_reason="tool_calls"``.
        An empty input stream yields nothing at all.

    Raises:
        AssertionError: If a non-streamed completion has no ``"usage"`` key.
    """

    def _call(arguments: str) -> dict:
        # A fresh dict per use so the legacy and tool_calls views never alias.
        return {"name": tool_name, "arguments": arguments}

    def _chunk(id_, created, model, delta, finish_reason=None) -> dict:
        # Common envelope shared by every streamed chat completion chunk.
        return {
            "id": id_,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "finish_reason": finish_reason,
                    "logprobs": None,
                    "delta": delta,
                }
            ],
        }

    if not stream:
        completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
        assert "usage" in completion
        tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
        arguments = completion["choices"][0]["text"]
        # TODO: Fix for legacy function calls
        chat_completion: llama_types.CreateChatCompletionResponse = {
            "id": "chat" + completion["id"],
            "object": "chat.completion",
            "created": completion["created"],
            "model": completion["model"],
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": None,
                        "function_call": _call(arguments),
                        "tool_calls": [
                            {
                                "id": tool_id,
                                "type": "function",
                                "function": _call(arguments),
                            }
                        ],
                    },
                    "finish_reason": "tool_calls",
                }
            ],
            "usage": completion["usage"],
        }
        return chat_completion

    chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore

    def _stream_response_to_function_stream(
        chunks: Iterator[llama_types.CreateCompletionStreamResponse],
    ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
        id_ = None
        created = None
        model = None
        tool_id = None
        for chunk in chunks:
            if tool_id is None:
                # First chunk: remember the stream-wide identifiers and open
                # the stream with a role-only delta.
                id_ = "chat" + chunk["id"]
                created = chunk["created"]
                model = chunk["model"]
                tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
                yield _chunk(
                    id_,
                    created,
                    model,
                    {
                        "role": "assistant",
                        "content": None,
                        "function_call": None,
                        "tool_calls": None,
                    },
                )
            # The first chunk's text must reach ``tool_calls`` as well as
            # ``function_call``; sending "" there would drop the first token
            # for clients that reassemble arguments from ``tool_calls``.
            text = chunk["choices"][0]["text"]
            yield _chunk(
                "chat" + chunk["id"],
                chunk["created"],
                chunk["model"],
                {
                    "role": None,
                    "content": None,
                    "function_call": _call(text),
                    "tool_calls": [
                        {
                            "index": 0,
                            "id": tool_id,
                            "type": "function",
                            "function": _call(text),
                        }
                    ],
                },
            )

        # Close the stream only if at least one chunk was seen.
        if id_ is not None and created is not None and model is not None:
            yield _chunk(
                id_,
                created,
                model,
                {
                    "role": None,
                    "content": None,
                    "function_call": None,
                    "tool_calls": None,
                },
                finish_reason="tool_calls",
            )

    return _stream_response_to_function_stream(chunks)
# Case 2: Tool choice by user
if isinstance(tool_choice, dict):
tool_name = tool_choice["function"]["name"]

View file

@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# // Abort callback
# // if it returns true, execution of llama_decode() will be aborted
# // currently works only with CPU execution
@ -667,13 +668,15 @@ It might not exist for progress report where '.' is output repeatedly."""
# // model quantization parameters
# typedef struct llama_model_quantize_params {
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# void * imatrix; // pointer to importance matrix data
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype
# enum ggml_type output_tensor_type; // output tensor type
# enum ggml_type token_embedding_type; // token embeddings tensor type
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# void * imatrix; // pointer to importance matrix data
# } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure):
"""Parameters for llama_model_quantize
@ -681,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
Attributes:
nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
ftype (int): quantize to this llama_ftype
output_tensor_type (int): output tensor type
token_embedding_type (int): token embeddings tensor type
allow_requantize (bool): allow quantizing non-f32/f16 tensors
quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type
imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
imatrix (ctypes.c_void_p): pointer to importance matrix data
"""
_fields_ = [
("nthread", ctypes.c_int32),
("ftype", ctypes.c_int),
("output_tensor_type", ctypes.c_int),
("token_embedding_type", ctypes.c_int),
("allow_requantize", ctypes.c_bool),
("quantize_output_tensor", ctypes.c_bool),
("only_copy", ctypes.c_bool),
@ -1006,6 +1013,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
def llama_n_embd(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
# Thin ctypes binding for the C function `llama_n_layer`: takes a model handle and
# returns a C int32 (presumably the model's layer count; confirm against llama.h).
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int: ...
# // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@ -1166,12 +1178,18 @@ def llama_model_quantize(
...
# // Apply a LoRA adapter to a loaded model
# // path_base_model is the path to a higher quality model to use as a base for
# // the layers modified by the adapter. Can be NULL to use the current loaded model.
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
# // will be applied on top of the previous one
# // Returns 0 on success
# LLAMA_API int32_t llama_model_apply_lora_from_file(
# const struct llama_model * model,
# const char * path_lora,
# float scale,
# const char * path_base_model,
# int32_t n_threads);
# const char * path_lora,
# float scale,
# const char * path_base_model,
# int32_t n_threads);
@ctypes_function(
"llama_model_apply_lora_from_file",
[
@ -1190,7 +1208,57 @@ def llama_model_apply_lora_from_file(
path_base_model: Union[ctypes.c_char_p, bytes, None],
n_threads: Union[ctypes.c_int32, int],
/,
) -> int: ...
) -> int:
"""Apply a LoRA adapter to a loaded model
path_base_model is the path to a higher quality model to use as a base for
the layers modified by the adapter. Can be NULL to use the current loaded model.
The model needs to be reloaded before applying a new adapter, otherwise the adapter
will be applied on top of the previous one
Returns 0 on success"""
...
# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
# // the currently loaded vector.
# // n_embd should be the size of a single layer's control, and data should point
# // to an n_embd x n_layers buffer starting from layer 1.
# // il_start and il_end are the layer range the vector should apply to (both inclusive)
# // See llama_control_vector_load in common to load a control vector.
# LLAMA_API int32_t llama_control_vector_apply(
# struct llama_context * lctx,
# const float * data,
# size_t len,
# int32_t n_embd,
# int32_t il_start,
# int32_t il_end);
@ctypes_function(
"llama_control_vector_apply",
[
llama_context_p_ctypes,
ctypes.POINTER(ctypes.c_float),
ctypes.c_size_t,
ctypes.c_int32,
ctypes.c_int32,
ctypes.c_int32,
],
ctypes.c_int32,
)
def llama_control_vector_apply(
lctx: llama_context_p,
data: CtypesPointerOrRef[ctypes.c_float],
len: int,
n_embd: int,
il_start: int,
il_end: int,
/,
) -> int:
"""Apply a loaded control vector to a llama_context, or if data is NULL, clear
the currently loaded vector.

See llama_control_vector_load in common to load a control vector.

Args:
    lctx: The llama_context the control vector is applied to.
    data: Pointer to an n_embd x n_layers float buffer starting from layer 1,
        or NULL to clear the currently loaded vector.
    len: Passed as the C ``size_t len`` argument (presumably the number of
        floats in ``data``; confirm against llama.h). Note that this name
        mirrors the C prototype and shadows the ``len`` builtin.
    n_embd: The size of a single layer's control.
    il_start: First layer the vector should apply to (inclusive).
    il_end: Last layer the vector should apply to (inclusive).

Returns:
    The int32 value returned by the C function. Its meaning is not stated in
    the header comment (presumably 0 on success; confirm against llama.cpp)."""
...
# //
@ -1205,6 +1273,12 @@ def llama_model_apply_lora_from_file(
# llama_pos pos;
# };
class llama_kv_cache_view_cell(ctypes.Structure):
"""Information associated with an individual cell in the KV cache view.
Attributes:
pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
May be negative if the cell is not populated."""
_fields_ = [("pos", llama_pos)]
@ -1985,7 +2059,7 @@ def llama_tokenize(
/,
) -> int:
"""Convert the provided text into tokens.
Args:
model: The model to use for tokenization.
text: The text to tokenize.
@ -1995,10 +2069,11 @@ def llama_tokenize(
add_bos: Whether to add a beginning-of-sentence token.
special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
Does not insert a leading space.
Returns:
Returns the number of tokens on success, no more than n_tokens_max
Returns a negative number on failure - the number of tokens that would have been returned"""
Returns a negative number on failure - the number of tokens that would have been returned
"""
...
@ -2674,6 +2749,48 @@ def llama_beam_search(
): ...
# /// @details Build a split GGUF final path for this chunk.
# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
# // Returns the split_path length.
# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
@ctypes_function(
"llama_split_path",
[ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
ctypes.c_int,
)
def llama_split_path(
split_path: bytes,
maxlen: Union[ctypes.c_size_t, int],
path_prefix: bytes,
split_no: Union[ctypes.c_int, int],
split_count: Union[ctypes.c_int, int],
/,
) -> int:
"""Build a split GGUF final path for this chunk.

Example from the C header:
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4)
    => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"

Args:
    split_path: Output buffer that receives the generated path.
        NOTE(review): the parameter is annotated ``bytes``, but the C function
        writes into this buffer and a Python ``bytes`` object is immutable.
        Callers presumably need a writable buffer such as
        ``ctypes.create_string_buffer(maxlen)``; confirm.
    maxlen: Size of the output buffer (``sizeof(split_path)`` in the C example).
    path_prefix: Path prefix the split file name is built from.
    split_no: Number of this split (``2`` in the example above).
    split_count: Total number of splits (``4`` in the example above).

Returns:
    The split_path length."""
...
# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
# // Returns the split_prefix length.
# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
@ctypes_function(
"llama_split_prefix",
[ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
ctypes.c_int,
)
def llama_split_prefix(
split_prefix: bytes,
maxlen: Union[ctypes.c_size_t, int],
split_path: bytes,
split_no: Union[ctypes.c_int, int],
split_count: Union[ctypes.c_int, int],
/,
) -> int:
"""Extract the path prefix from the split_path if and only if the split_no and split_count match.

Example from the C header:
    llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4)
    => split_prefix = "/models/ggml-model-q4_0"

Args:
    split_prefix: Output buffer that receives the extracted prefix.
        NOTE(review): the parameter is annotated ``bytes``, but the C function
        writes into this buffer and a Python ``bytes`` object is immutable.
        Callers presumably need a writable buffer such as
        ``ctypes.create_string_buffer(maxlen)``; confirm.
    maxlen: Size of the output buffer (``64`` in the C example).
    split_path: Full path of one split file.
    split_no: Split number expected to appear in ``split_path``.
    split_count: Split count expected to appear in ``split_path``.

Returns:
    The split_prefix length."""
...
# Performance information

View file

@ -12,14 +12,7 @@ import llama_cpp
import anyio
from anyio.streams.memory import MemoryObjectSendStream
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
from fastapi import (
Depends,
FastAPI,
APIRouter,
Request,
HTTPException,
status,
)
from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
from fastapi.middleware import Middleware
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer
@ -356,7 +349,64 @@ async def create_embedding(
)
async def create_chat_completion(
request: Request,
body: CreateChatCompletionRequest,
body: CreateChatCompletionRequest = Body(
openapi_examples={
"normal": {
"summary": "Chat Completion",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
],
},
},
"json_mode": {
"summary": "JSON Mode",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020"},
],
"response_format": { "type": "json_object" }
},
},
"tool_calling": {
"summary": "Tool Calling",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Extract Jason is 30 years old."},
],
"tools": [
{
"type": "function",
"function": {
"name": "User",
"description": "User record",
"parameters": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "number"},
},
"required": ["name", "age"],
},
}
}
],
"tool_choice": {
"type": "function",
"function": {
"name": "User",
}
}
},
},
}
),
llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> llama_cpp.ChatCompletion:
exclude = {

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652