This commit is contained in:
baalajimaestro 2024-03-23 15:47:34 +05:30
commit d3afd4507f
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
7 changed files with 504 additions and 268 deletions

View file

@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [0.2.57]
- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
## [0.2.56] ## [0.2.56]
- feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e

View file

@ -17,6 +17,11 @@ if (LLAMA_BUILD)
set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE) set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
endif() endif()
if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
endif()
add_subdirectory(vendor/llama.cpp) add_subdirectory(vendor/llama.cpp)
install( install(
TARGETS llama TARGETS llama

View file

@ -1,4 +1,4 @@
from .llama_cpp import * from .llama_cpp import *
from .llama import * from .llama import *
__version__ = "0.2.56" __version__ = "0.2.57"

View file

@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter):
self, self,
*, *,
messages: List[llama_types.ChatCompletionRequestMessage], messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
**kwargs: Any, **kwargs: Any,
) -> ChatFormatterResponse: ) -> ChatFormatterResponse:
def raise_exception(message: str): def raise_exception(message: str):
@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter):
bos_token=self.bos_token, bos_token=self.bos_token,
raise_exception=raise_exception, raise_exception=raise_exception,
add_generation_prompt=self.add_generation_prompt, add_generation_prompt=self.add_generation_prompt,
functions=functions,
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
) )
return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
@ -288,6 +296,183 @@ def _convert_completion_to_chat(
return _convert_text_completion_to_chat(completion) return _convert_text_completion_to_chat(completion)
def _convert_completion_to_chat_function(
tool_name: str,
completion_or_chunks: Union[
llama_types.CreateCompletionResponse,
Iterator[llama_types.CreateCompletionStreamResponse],
],
stream: bool,
):
if not stream:
completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore
assert "usage" in completion
tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
# TODO: Fix for legacy function calls
chat_completion: llama_types.CreateChatCompletionResponse = {
"id": "chat" + completion["id"],
"object": "chat.completion",
"created": completion["created"],
"model": completion["model"],
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": None,
"function_call": {
"name": tool_name,
"arguments": completion["choices"][0]["text"],
},
"tool_calls": [
{
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": completion["choices"][0]["text"],
},
}
],
},
"finish_reason": "tool_calls",
}
],
"usage": completion["usage"],
}
return chat_completion
else:
chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore
def _stream_response_to_function_stream(
chunks: Iterator[llama_types.CreateCompletionStreamResponse],
) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
# blank first message
first = True
id_ = None
created = None
model = None
tool_id = None
for chunk in chunks:
if first:
id_ = "chat" + chunk["id"]
created = chunk["created"]
model = chunk["model"]
tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
yield {
"id": id_,
"object": "chat.completion.chunk",
"created": created,
"model": model,
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": "assistant",
"content": None,
"function_call": None,
"tool_calls": None,
},
}
],
}
yield {
"id": "chat" + chunk["id"],
"object": "chat.completion.chunk",
"created": chunk["created"],
"model": chunk["model"],
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": {
"name": tool_name,
"arguments": chunk["choices"][0]["text"],
},
"tool_calls": [
{
"index": 0,
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": chunk["choices"][0]["text"],
},
}
],
},
}
],
}
first = False
continue
assert tool_id is not None
yield {
"id": "chat" + chunk["id"],
"object": "chat.completion.chunk",
"created": chunk["created"],
"model": chunk["model"],
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": {
"name": tool_name,
"arguments": chunk["choices"][0]["text"],
},
"tool_calls": [
{
"index": 0,
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": chunk["choices"][0][
"text"
],
},
}
],
},
}
],
}
if id_ is not None and created is not None and model is not None:
yield {
"id": id_,
"object": "chat.completion.chunk",
"created": created,
"model": model,
"choices": [
{
"index": 0,
"finish_reason": "tool_calls",
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": None,
"tool_calls": None,
},
}
],
}
return _stream_response_to_function_stream(chunks)
def chat_formatter_to_chat_completion_handler( def chat_formatter_to_chat_completion_handler(
chat_formatter: ChatFormatter, chat_formatter: ChatFormatter,
) -> LlamaChatCompletionHandler: ) -> LlamaChatCompletionHandler:
@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler(
messages=messages, messages=messages,
functions=functions, functions=functions,
function_call=function_call, function_call=function_call,
tools=tools,
tool_choice=tool_choice,
) )
prompt = result.prompt prompt = result.prompt
if result.stop is not None: if result.stop is not None:
@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler(
if response_format is not None and response_format["type"] == "json_object": if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format, verbose=llama.verbose) grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)
# Convert legacy functions to tools
if functions is not None:
tools = [
{
"type": "function",
"function": function,
}
for function in functions
]
# Convert legacy function_call to tool_choice
if function_call is not None:
if isinstance(function_call, str) and (
function_call == "none" or function_call == "auto"
):
tool_choice = function_call
if isinstance(function_call, dict) and "name" in function_call:
tool_choice = {
"type": "function",
"function": {
"name": function_call["name"],
},
}
tool = None
if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
name = tool_choice["function"]["name"]
tool = next((t for t in tools if t["function"]["name"] == name), None)
if tool is None:
raise ValueError(f"Tool choice '{name}' not found in tools.")
schema = tool["function"]["parameters"]
try:
# create grammar from json schema
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(schema), verbose=llama.verbose
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)
completion_or_chunks = llama.create_completion( completion_or_chunks = llama.create_completion(
prompt=prompt, prompt=prompt,
temperature=temperature, temperature=temperature,
@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler(
grammar=grammar, grammar=grammar,
logit_bias=logit_bias, logit_bias=logit_bias,
) )
if tool is not None:
tool_name = tool["function"]["name"]
return _convert_completion_to_chat_function(
tool_name, completion_or_chunks, stream
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream) return _convert_completion_to_chat(completion_or_chunks, stream=stream)
return chat_completion_handler return chat_completion_handler
@ -1596,13 +1829,15 @@ def functionary_v1_v2_chat_handler(
function_call = ( function_call = (
tool_choice if isinstance(tool_choice, str) else tool_choice["function"] tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
) )
else:
function_call = "auto"
prompt = prepare_messages_for_inference( prompt = prepare_messages_for_inference(
messages, tokenizer, version, functions, tools messages, tokenizer, version, functions, tools
) )
# If no tools/functions are provided # If no tools/functions are provided
if function_call is None and (functions is None or len(functions) == 0): if function_call == "none" or functions is None or len(functions) == 0:
if version == "v1": if version == "v1":
stop = END_ASSISTANT_TOKEN stop = END_ASSISTANT_TOKEN
else: else:
@ -1630,6 +1865,7 @@ def functionary_v1_v2_chat_handler(
logits_processor=logits_processor, logits_processor=logits_processor,
grammar=grammar, grammar=grammar,
) )
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
assert stream is False # TODO: support stream mode assert stream is False # TODO: support stream mode
@ -1692,13 +1928,12 @@ def functionary_v1_v2_chat_handler(
return completion return completion
content = ""
function_calls, function_bodies = [], [] function_calls, function_bodies = [], []
if version == "v1": if version == "v1":
# If no or "auto" tool_choice/function_call # If no or "auto" tool_choice/function_call
if function_call is None or ( if isinstance(function_call, str) and function_call == "auto":
isinstance(function_call, str) and function_call == "auto"
):
stops = ["\n", END_ASSISTANT_TOKEN] stops = ["\n", END_ASSISTANT_TOKEN]
# If tool_choice/function_call is "none" # If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none": elif isinstance(function_call, str) and function_call == "none":
@ -1747,70 +1982,67 @@ def functionary_v1_v2_chat_handler(
else: else:
function_bodies.append(completion_text.strip()) function_bodies.append(completion_text.strip())
else: else:
# Loop until all parallel function calls are generated
while True:
# If no or "auto" tool_choice/function_call
if function_call is None or (
isinstance(function_call, str) and function_call == "auto"
):
grammar = None
stops = CONTENT_TOKEN
# If tool_choice/function_call is "none" # If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none": if isinstance(function_call, str) and function_call == "none":
prompt = ( prompt = (
prepare_messages_for_inference(messages, tokenizer, version, [], []) prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ "all\n<|content|>" + "all\n<|content|>"
) )
stops = STOP_TOKEN stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If tool_choice/function_call is provided # If tool_choice/function_call is provided
elif isinstance(function_call, dict): elif isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
stops = STOP_TOKEN
function_call = function_call["name"] function_call = function_call["name"]
function_calls.append(function_call) function_calls.append(function_call)
grammar = get_grammar(function_call) grammar = get_grammar(function_call)
else: stops = [STOP_TOKEN, FROM_TOKEN]
prompt = prompt
stops = STOP_TOKEN
completion = create_completion(stop=stops) completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"] completion_text = completion["choices"][0]["text"]
function_bodies.append(completion_text.strip())
# If the generation does not involve a function call # If "auto" or no tool_choice/function_call
if prompt.endswith("all\n<|content|>") and not completion_text.startswith( elif isinstance(function_call, str) and function_call == "auto":
"all" while True:
): # Generate function name first
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# Generate model response if the model decides not to call any function
elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
prompt += completion_text + CONTENT_TOKEN
completion = create_completion(stop=STOP_TOKEN)
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# Generate parameters if model decides to call a function
elif prompt.endswith(RECIPIENT_TOKEN):
function_calls.append(completion_text[:-1])
grammar = get_grammar(function_calls[-1])
completion = create_completion(stop=[STOP_TOKEN, "\n"])
function_bodies.append(completion["choices"][0]["text"].strip())
prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
grammar = None grammar = None
stops = CONTENT_TOKEN
# Try to generate the beginning of next turn completion = create_completion(stop=stops)
# If empty completion, break from loop completion_text = completion["choices"][0]["text"]
next_turn_completion_text = create_completion( function_name = completion_text.strip()
stop=[STOP_TOKEN, RECIPIENT_TOKEN] if function_name == "all":
)["choices"][0]["text"] prompt += "all\n<|content|>"
if len(next_turn_completion_text) > 0: else:
prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" function_call = completion_text.strip()
prompt += f"{function_call}\n<|content|>"
function_calls.append(function_call)
grammar = get_grammar(function_call)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
if function_name == "all":
content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
content = content.lstrip()
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else: else:
break break
# Break from loop if tool_choice/function_call is provided as a dict
else: else:
function_bodies.append(completion_text.strip()) function_bodies.append(completion_text.strip())
# Check whether the model wants to generate another turn
prompt += completion_text.strip()
grammar = None
completion = create_completion(stop=stops)
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
else:
break break
assert "usage" in completion assert "usage" in completion
assert len(function_calls) > 0
assert len(function_calls) == len(function_bodies) assert len(function_calls) == len(function_bodies)
tool_calls = [] tool_calls = []
@ -1843,14 +2075,14 @@ def functionary_v1_v2_chat_handler(
"index": 0, "index": 0,
"message": { "message": {
"role": "assistant", "role": "assistant",
"content": None, "content": None if content == "" else content,
"function_call": { "function_call": {
"name": tool_calls[0]["function"]["name"], "name": tool_calls[0]["function"]["name"],
"arguments": tool_calls[0]["function"]["arguments"], "arguments": tool_calls[0]["function"]["arguments"],
} if len(tool_calls) > 0 else None,
"tool_calls": tool_calls if len(tool_calls) > 0 else None,
}, },
"tool_calls": tool_calls, "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
},
"finish_reason": "tool_calls",
} }
], ],
usage=completion["usage"], usage=completion["usage"],
@ -2199,181 +2431,6 @@ def chatml_function_calling(
stream=stream, stream=stream,
) )
def _convert_completion_to_chat_function(
tool_name: str,
completion_or_chunks: Union[
llama_types.CreateCompletionResponse,
Iterator[llama_types.CreateCompletionStreamResponse],
],
stream: bool,
):
if not stream:
completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore
assert "usage" in completion
tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
# TODO: Fix for legacy function calls
chat_completion: llama_types.CreateChatCompletionResponse = {
"id": "chat" + completion["id"],
"object": "chat.completion",
"created": completion["created"],
"model": completion["model"],
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": None,
"function_call": {
"name": tool_name,
"arguments": completion["choices"][0]["text"],
},
"tool_calls": [
{
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": completion["choices"][0]["text"],
},
}
],
},
"finish_reason": "tool_calls",
}
],
"usage": completion["usage"],
}
return chat_completion
else:
chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore
def _stream_response_to_function_stream(
chunks: Iterator[llama_types.CreateCompletionStreamResponse],
) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
# blank first message
first = True
id_ = None
created = None
model = None
tool_id = None
for chunk in chunks:
if first:
id_ = "chat" + chunk["id"]
created = chunk["created"]
model = chunk["model"]
tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
yield {
"id": id_,
"object": "chat.completion.chunk",
"created": created,
"model": model,
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": "assistant",
"content": None,
"function_call": None,
"tool_calls": None,
},
}
],
}
yield {
"id": "chat" + chunk["id"],
"object": "chat.completion.chunk",
"created": chunk["created"],
"model": chunk["model"],
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": {
"name": tool_name,
"arguments": chunk["choices"][0]["text"],
},
"tool_calls": [
{
"index": 0,
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": "",
},
}
],
},
}
],
}
first = False
continue
assert tool_id is not None
yield {
"id": "chat" + chunk["id"],
"object": "chat.completion.chunk",
"created": chunk["created"],
"model": chunk["model"],
"choices": [
{
"index": 0,
"finish_reason": None,
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": {
"name": tool_name,
"arguments": chunk["choices"][0]["text"],
},
"tool_calls": [
{
"index": 0,
"id": tool_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": chunk["choices"][0][
"text"
],
},
}
],
},
}
],
}
if id_ is not None and created is not None and model is not None:
yield {
"id": id_,
"object": "chat.completion.chunk",
"created": created,
"model": model,
"choices": [
{
"index": 0,
"finish_reason": "tool_calls",
"logprobs": None,
"delta": {
"role": None,
"content": None,
"function_call": None,
"tool_calls": None,
},
}
],
}
return _stream_response_to_function_stream(chunks)
# Case 2: Tool choice by user # Case 2: Tool choice by user
if isinstance(tool_choice, dict): if isinstance(tool_choice, dict):
tool_name = tool_choice["function"]["name"] tool_name = tool_choice["function"]["name"]

View file

@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
# bool embeddings; // if true, extract embeddings (together with logits) # bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# // Abort callback # // Abort callback
# // if it returns true, execution of llama_decode() will be aborted # // if it returns true, execution of llama_decode() will be aborted
# // currently works only with CPU execution # // currently works only with CPU execution
@ -669,6 +670,8 @@ It might not exist for progress report where '.' is output repeatedly."""
# typedef struct llama_model_quantize_params { # typedef struct llama_model_quantize_params {
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype # enum llama_ftype ftype; // quantize to this llama_ftype
# enum ggml_type output_tensor_type; // output tensor type
# enum ggml_type token_embedding_type; // token embeddings tensor type
# bool allow_requantize; // allow quantizing non-f32/f16 tensors # bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight # bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@ -681,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
Attributes: Attributes:
nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
ftype (int): quantize to this llama_ftype ftype (int): quantize to this llama_ftype
output_tensor_type (int): output tensor type
token_embedding_type (int): token embeddings tensor type
allow_requantize (bool): allow quantizing non-f32/f16 tensors allow_requantize (bool): allow quantizing non-f32/f16 tensors
quantize_output_tensor (bool): quantize output.weight quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type pure (bool): quantize all tensors to the default type
imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data imatrix (ctypes.c_void_p): pointer to importance matrix data
""" """
_fields_ = [ _fields_ = [
("nthread", ctypes.c_int32), ("nthread", ctypes.c_int32),
("ftype", ctypes.c_int), ("ftype", ctypes.c_int),
("output_tensor_type", ctypes.c_int),
("token_embedding_type", ctypes.c_int),
("allow_requantize", ctypes.c_bool), ("allow_requantize", ctypes.c_bool),
("quantize_output_tensor", ctypes.c_bool), ("quantize_output_tensor", ctypes.c_bool),
("only_copy", ctypes.c_bool), ("only_copy", ctypes.c_bool),
@ -1006,6 +1013,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
def llama_n_embd(model: llama_model_p, /) -> int: ... def llama_n_embd(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
# Binding stub for the C function `llama_n_layer`: takes one model pointer and
# is declared with a c_int32 result. Presumably the model's layer count -- the
# header line above carries no description, so confirm against llama.h.
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int: ...
# // Get the model's RoPE frequency scaling factor # // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@ -1166,6 +1178,12 @@ def llama_model_quantize(
... ...
# // Apply a LoRA adapter to a loaded model
# // path_base_model is the path to a higher quality model to use as a base for
# // the layers modified by the adapter. Can be NULL to use the current loaded model.
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
# // will be applied on top of the previous one
# // Returns 0 on success
# LLAMA_API int32_t llama_model_apply_lora_from_file( # LLAMA_API int32_t llama_model_apply_lora_from_file(
# const struct llama_model * model, # const struct llama_model * model,
# const char * path_lora, # const char * path_lora,
@ -1190,7 +1208,57 @@ def llama_model_apply_lora_from_file(
path_base_model: Union[ctypes.c_char_p, bytes, None], path_base_model: Union[ctypes.c_char_p, bytes, None],
n_threads: Union[ctypes.c_int32, int], n_threads: Union[ctypes.c_int32, int],
/, /,
) -> int: ... ) -> int:
"""Apply a LoRA adapter to a loaded model
path_base_model is the path to a higher quality model to use as a base for
the layers modified by the adapter. Can be NULL to use the current loaded model.
The model needs to be reloaded before applying a new adapter, otherwise the adapter
will be applied on top of the previous one
Returns 0 on success"""
...
# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
# // the currently loaded vector.
# // n_embd should be the size of a single layer's control, and data should point
# // to an n_embd x n_layers buffer starting from layer 1.
# // il_start and il_end are the layer range the vector should apply to (both inclusive)
# // See llama_control_vector_load in common to load a control vector.
# LLAMA_API int32_t llama_control_vector_apply(
# struct llama_context * lctx,
# const float * data,
# size_t len,
# int32_t n_embd,
# int32_t il_start,
# int32_t il_end);
@ctypes_function(
    "llama_control_vector_apply",
    [
        llama_context_p_ctypes,
        ctypes.POINTER(ctypes.c_float),
        ctypes.c_size_t,
        ctypes.c_int32,
        ctypes.c_int32,
        ctypes.c_int32,
    ],
    ctypes.c_int32,
)
def llama_control_vector_apply(
    lctx: llama_context_p,
    data: CtypesPointerOrRef[ctypes.c_float],
    len: int,
    n_embd: int,
    il_start: int,
    il_end: int,
    /,
) -> int:
    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
    the currently loaded vector.
    n_embd should be the size of a single layer's control, and data should point
    to an n_embd x n_layers buffer starting from layer 1.
    il_start and il_end are the layer range the vector should apply to (both inclusive)
    See llama_control_vector_load in common to load a control vector.

    Args:
        lctx: The llama context the control vector is applied to.
        data: Pointer to the float buffer, or NULL to clear the current vector.
        len: Passed to C as ``size_t len``; presumably the number of floats in
            ``data`` -- the header does not say, confirm against llama.cpp.
        n_embd: Size of a single layer's control.
        il_start: First layer the vector applies to (inclusive).
        il_end: Last layer the vector applies to (inclusive).

    Returns:
        The ``int32_t`` result of the C call. The header comment does not
        document its values -- NOTE(review): confirm the success/failure
        convention before relying on it.
    """
    ...
# // # //
@ -1205,6 +1273,12 @@ def llama_model_apply_lora_from_file(
# llama_pos pos; # llama_pos pos;
# }; # };
class llama_kv_cache_view_cell(ctypes.Structure): class llama_kv_cache_view_cell(ctypes.Structure):
"""Information associated with an individual cell in the KV cache view.
Attributes:
pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
May be negative if the cell is not populated."""
_fields_ = [("pos", llama_pos)] _fields_ = [("pos", llama_pos)]
@ -1998,7 +2072,8 @@ def llama_tokenize(
Returns: Returns:
Returns the number of tokens on success, no more than n_tokens_max Returns the number of tokens on success, no more than n_tokens_max
Returns a negative number on failure - the number of tokens that would have been returned""" Returns a negative number on failure - the number of tokens that would have been returned
"""
... ...
@ -2674,6 +2749,48 @@ def llama_beam_search(
): ... ): ...
# /// @details Build a split GGUF final path for this chunk.
# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
# // Returns the split_path length.
# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
@ctypes_function(
    "llama_split_path",
    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
    ctypes.c_int,
)
def llama_split_path(
    split_path: bytes,
    maxlen: Union[ctypes.c_size_t, int],
    path_prefix: bytes,
    split_no: Union[ctypes.c_int, int],
    split_count: Union[ctypes.c_int, int],
    /,
) -> int:
    """Build a split GGUF final path for this chunk.

    Example from the C header:
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4)
        => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"

    Returns the split_path length.

    NOTE(review): ``split_path`` is a non-const ``char *`` output buffer in the
    C signature, so callers presumably need a writable buffer of at least
    ``maxlen`` bytes (e.g. from ``ctypes.create_string_buffer``) rather than an
    immutable ``bytes`` object -- confirm.
    """
    ...
# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
# // Returns the split_prefix length.
# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
@ctypes_function(
    "llama_split_prefix",
    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
    ctypes.c_int,
)
def llama_split_prefix(
    split_prefix: bytes,
    maxlen: Union[ctypes.c_size_t, int],
    split_path: bytes,
    split_no: Union[ctypes.c_int, int],
    split_count: Union[ctypes.c_int, int],
    /,
) -> int:
    """Extract the path prefix from the split_path if and only if the split_no and split_count match.

    Example from the C header:
        llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4)
        => split_prefix = "/models/ggml-model-q4_0"

    Returns the split_prefix length.

    NOTE(review): ``split_prefix`` is a non-const ``char *`` output buffer in
    the C signature, so callers presumably need a writable buffer of at least
    ``maxlen`` bytes (e.g. from ``ctypes.create_string_buffer``) rather than an
    immutable ``bytes`` object -- confirm.
    """
    ...
# Performance information # Performance information

View file

@ -12,14 +12,7 @@ import llama_cpp
import anyio import anyio
from anyio.streams.memory import MemoryObjectSendStream from anyio.streams.memory import MemoryObjectSendStream
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
from fastapi import ( from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
Depends,
FastAPI,
APIRouter,
Request,
HTTPException,
status,
)
from fastapi.middleware import Middleware from fastapi.middleware import Middleware
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer from fastapi.security import HTTPBearer
@ -356,7 +349,64 @@ async def create_embedding(
) )
async def create_chat_completion( async def create_chat_completion(
request: Request, request: Request,
body: CreateChatCompletionRequest, body: CreateChatCompletionRequest = Body(
openapi_examples={
"normal": {
"summary": "Chat Completion",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
],
},
},
"json_mode": {
"summary": "JSON Mode",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020"},
],
"response_format": { "type": "json_object" }
},
},
"tool_calling": {
"summary": "Tool Calling",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Extract Jason is 30 years old."},
],
"tools": [
{
"type": "function",
"function": {
"name": "User",
"description": "User record",
"parameters": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "number"},
},
"required": ["name", "age"],
},
}
}
],
"tool_choice": {
"type": "function",
"function": {
"name": "User",
}
}
},
},
}
),
llama_proxy: LlamaProxy = Depends(get_llama_proxy), llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> llama_cpp.ChatCompletion: ) -> llama_cpp.ChatCompletion:
exclude = { exclude = {

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652