Compare commits
13 commits
4cb67f59d8
...
d3afd4507f
Author | SHA1 | Date | |
---|---|---|---|
d3afd4507f | |||
|
c1325dcdfb | ||
|
e325a831f0 | ||
|
c89be28ef9 | ||
|
3db03b7302 | ||
|
740f3f3812 | ||
|
f7decc9562 | ||
|
60d8498f21 | ||
|
18d7ce918f | ||
|
7d4a5ec59f | ||
|
bf64752535 | ||
|
8a60c7bc8c | ||
|
8d298b4750 |
7 changed files with 504 additions and 268 deletions
|
@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.57]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
|
||||
- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
|
||||
- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
|
||||
- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
|
||||
|
||||
## [0.2.56]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e
|
||||
|
|
|
@ -17,6 +17,11 @@ if (LLAMA_BUILD)
|
|||
set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
|
||||
set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
|
||||
endif()
|
||||
|
||||
if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
|
||||
set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
|
||||
endif()
|
||||
|
||||
add_subdirectory(vendor/llama.cpp)
|
||||
install(
|
||||
TARGETS llama
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .llama_cpp import *
|
||||
from .llama import *
|
||||
|
||||
__version__ = "0.2.56"
|
||||
__version__ = "0.2.57"
|
|
@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter):
|
|||
self,
|
||||
*,
|
||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||
functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
|
||||
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
|
||||
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
|
||||
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
|
||||
**kwargs: Any,
|
||||
) -> ChatFormatterResponse:
|
||||
def raise_exception(message: str):
|
||||
|
@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter):
|
|||
bos_token=self.bos_token,
|
||||
raise_exception=raise_exception,
|
||||
add_generation_prompt=self.add_generation_prompt,
|
||||
functions=functions,
|
||||
function_call=function_call,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
|
||||
return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
|
||||
|
@ -288,6 +296,183 @@ def _convert_completion_to_chat(
|
|||
return _convert_text_completion_to_chat(completion)
|
||||
|
||||
|
||||
def _convert_completion_to_chat_function(
    tool_name: str,
    completion_or_chunks: Union[
        llama_types.CreateCompletionResponse,
        Iterator[llama_types.CreateCompletionStreamResponse],
    ],
    stream: bool,
):
    """Re-shape a text completion into a chat completion that calls ``tool_name``.

    The generated text is used verbatim as the call's ``arguments`` and is
    reported twice: under the legacy ``function_call`` key and as the single
    entry of ``tool_calls``.

    Args:
        tool_name: Name reported for the called function/tool.
        completion_or_chunks: A finished completion when ``stream`` is false,
            otherwise an iterator of completion stream chunks.
        stream: Selects which of the two input shapes is expected.

    Returns:
        A chat completion dict when ``stream`` is false. Otherwise a generator
        of chat completion chunks: one role-only chunk, one argument chunk per
        input chunk, and a closing chunk with ``finish_reason="tool_calls"``
        (the closing chunk is only produced if at least one input chunk was
        seen and its id/created/model were not ``None``).
    """

    def _call(arguments):
        # Build a fresh dict on every use so the legacy ``function_call`` entry
        # and the ``tool_calls`` entry never share one object.
        return {"name": tool_name, "arguments": arguments}

    def _stream_response_to_function_stream(
        chunks: Iterator[llama_types.CreateCompletionStreamResponse],
    ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
        id_ = created = model = tool_id = None
        for position, chunk in enumerate(chunks):
            if position == 0:
                # Remember the identity of the stream; it is reused for the
                # role-only opening chunk and for the closing chunk.
                id_ = "chat" + chunk["id"]
                created = chunk["created"]
                model = chunk["model"]
                tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
                # blank first message
                yield {
                    "id": id_,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": model,
                    "choices": [
                        {
                            "index": 0,
                            "finish_reason": None,
                            "logprobs": None,
                            "delta": {
                                "role": "assistant",
                                "content": None,
                                "function_call": None,
                                "tool_calls": None,
                            },
                        }
                    ],
                }
            # Every input chunk (the first one included) is forwarded as an
            # argument delta carrying that chunk's text.
            yield {
                "id": "chat" + chunk["id"],
                "object": "chat.completion.chunk",
                "created": chunk["created"],
                "model": chunk["model"],
                "choices": [
                    {
                        "index": 0,
                        "finish_reason": None,
                        "logprobs": None,
                        "delta": {
                            "role": None,
                            "content": None,
                            "function_call": _call(chunk["choices"][0]["text"]),
                            "tool_calls": [
                                {
                                    "index": 0,
                                    "id": tool_id,
                                    "type": "function",
                                    "function": _call(chunk["choices"][0]["text"]),
                                }
                            ],
                        },
                    }
                ],
            }

        if id_ is None or created is None or model is None:
            # Nothing was streamed (or the identity is unusable): no closer.
            return
        yield {
            "id": id_,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "finish_reason": "tool_calls",
                    "logprobs": None,
                    "delta": {
                        "role": None,
                        "content": None,
                        "function_call": None,
                        "tool_calls": None,
                    },
                }
            ],
        }

    if stream:
        return _stream_response_to_function_stream(completion_or_chunks)  # type: ignore

    finished: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
    assert "usage" in finished
    call_id = "call_" + "_0_" + tool_name + "_" + finished["id"]
    # TODO: Fix for legacy function calls
    response: llama_types.CreateChatCompletionResponse = {
        "id": "chat" + finished["id"],
        "object": "chat.completion",
        "created": finished["created"],
        "model": finished["model"],
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": None,
                    "function_call": _call(finished["choices"][0]["text"]),
                    "tool_calls": [
                        {
                            "id": call_id,
                            "type": "function",
                            "function": _call(finished["choices"][0]["text"]),
                        }
                    ],
                },
                "finish_reason": "tool_calls",
            }
        ],
        "usage": finished["usage"],
    }
    return response
|
||||
|
||||
|
||||
|
||||
def chat_formatter_to_chat_completion_handler(
|
||||
chat_formatter: ChatFormatter,
|
||||
) -> LlamaChatCompletionHandler:
|
||||
|
@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler(
|
|||
messages=messages,
|
||||
functions=functions,
|
||||
function_call=function_call,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
prompt = result.prompt
|
||||
if result.stop is not None:
|
||||
|
@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler(
|
|||
if response_format is not None and response_format["type"] == "json_object":
|
||||
grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)
|
||||
|
||||
# Convert legacy functions to tools
|
||||
if functions is not None:
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": function,
|
||||
}
|
||||
for function in functions
|
||||
]
|
||||
|
||||
# Convert legacy function_call to tool_choice
|
||||
if function_call is not None:
|
||||
if isinstance(function_call, str) and (
|
||||
function_call == "none" or function_call == "auto"
|
||||
):
|
||||
tool_choice = function_call
|
||||
if isinstance(function_call, dict) and "name" in function_call:
|
||||
tool_choice = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
},
|
||||
}
|
||||
|
||||
tool = None
|
||||
if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
|
||||
name = tool_choice["function"]["name"]
|
||||
tool = next((t for t in tools if t["function"]["name"] == name), None)
|
||||
if tool is None:
|
||||
raise ValueError(f"Tool choice '{name}' not found in tools.")
|
||||
schema = tool["function"]["parameters"]
|
||||
try:
|
||||
# create grammar from json schema
|
||||
grammar = llama_grammar.LlamaGrammar.from_json_schema(
|
||||
json.dumps(schema), verbose=llama.verbose
|
||||
)
|
||||
except Exception as e:
|
||||
grammar = llama_grammar.LlamaGrammar.from_string(
|
||||
llama_grammar.JSON_GBNF, verbose=llama.verbose
|
||||
)
|
||||
|
||||
completion_or_chunks = llama.create_completion(
|
||||
prompt=prompt,
|
||||
temperature=temperature,
|
||||
|
@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler(
|
|||
grammar=grammar,
|
||||
logit_bias=logit_bias,
|
||||
)
|
||||
if tool is not None:
|
||||
tool_name = tool["function"]["name"]
|
||||
return _convert_completion_to_chat_function(
|
||||
tool_name, completion_or_chunks, stream
|
||||
)
|
||||
return _convert_completion_to_chat(completion_or_chunks, stream=stream)
|
||||
|
||||
return chat_completion_handler
|
||||
|
@ -1596,13 +1829,15 @@ def functionary_v1_v2_chat_handler(
|
|||
function_call = (
|
||||
tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
|
||||
)
|
||||
else:
|
||||
function_call = "auto"
|
||||
|
||||
prompt = prepare_messages_for_inference(
|
||||
messages, tokenizer, version, functions, tools
|
||||
)
|
||||
|
||||
# If no tools/functions are provided
|
||||
if function_call is None and (functions is None or len(functions) == 0):
|
||||
if function_call == "none" or functions is None or len(functions) == 0:
|
||||
if version == "v1":
|
||||
stop = END_ASSISTANT_TOKEN
|
||||
else:
|
||||
|
@ -1630,6 +1865,7 @@ def functionary_v1_v2_chat_handler(
|
|||
logits_processor=logits_processor,
|
||||
grammar=grammar,
|
||||
)
|
||||
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
|
||||
return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
|
||||
|
||||
assert stream is False # TODO: support stream mode
|
||||
|
@ -1692,13 +1928,12 @@ def functionary_v1_v2_chat_handler(
|
|||
|
||||
return completion
|
||||
|
||||
content = ""
|
||||
function_calls, function_bodies = [], []
|
||||
|
||||
if version == "v1":
|
||||
# If no or "auto" tool_choice/function_call
|
||||
if function_call is None or (
|
||||
isinstance(function_call, str) and function_call == "auto"
|
||||
):
|
||||
if isinstance(function_call, str) and function_call == "auto":
|
||||
stops = ["\n", END_ASSISTANT_TOKEN]
|
||||
# If tool_choice/function_call is "none"
|
||||
elif isinstance(function_call, str) and function_call == "none":
|
||||
|
@ -1747,70 +1982,67 @@ def functionary_v1_v2_chat_handler(
|
|||
else:
|
||||
function_bodies.append(completion_text.strip())
|
||||
else:
|
||||
# Loop until all parallel function calls are generated
|
||||
while True:
|
||||
# If no or "auto" tool_choice/function_call
|
||||
if function_call is None or (
|
||||
isinstance(function_call, str) and function_call == "auto"
|
||||
):
|
||||
grammar = None
|
||||
stops = CONTENT_TOKEN
|
||||
# If tool_choice/function_call is "none"
|
||||
elif isinstance(function_call, str) and function_call == "none":
|
||||
prompt = (
|
||||
prepare_messages_for_inference(messages, tokenizer, version, [], [])
|
||||
+ "all\n<|content|>"
|
||||
)
|
||||
stops = STOP_TOKEN
|
||||
# If tool_choice/function_call is provided
|
||||
elif isinstance(function_call, dict):
|
||||
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
|
||||
stops = STOP_TOKEN
|
||||
function_call = function_call["name"]
|
||||
function_calls.append(function_call)
|
||||
grammar = get_grammar(function_call)
|
||||
else:
|
||||
prompt = prompt
|
||||
stops = STOP_TOKEN
|
||||
|
||||
# If tool_choice/function_call is "none"
|
||||
if isinstance(function_call, str) and function_call == "none":
|
||||
prompt = (
|
||||
prepare_messages_for_inference(messages, tokenizer, version, [], [])
|
||||
+ "all\n<|content|>"
|
||||
)
|
||||
stops = [STOP_TOKEN, FROM_TOKEN]
|
||||
completion = create_completion(stop=stops)
|
||||
completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
|
||||
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
|
||||
# If tool_choice/function_call is provided
|
||||
elif isinstance(function_call, dict):
|
||||
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
|
||||
function_call = function_call["name"]
|
||||
function_calls.append(function_call)
|
||||
grammar = get_grammar(function_call)
|
||||
stops = [STOP_TOKEN, FROM_TOKEN]
|
||||
completion = create_completion(stop=stops)
|
||||
completion_text = completion["choices"][0]["text"]
|
||||
|
||||
# If the generation does not involve a function call
|
||||
if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
|
||||
"all"
|
||||
):
|
||||
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
|
||||
# Generate model response if the model decides not to call any function
|
||||
elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
|
||||
prompt += completion_text + CONTENT_TOKEN
|
||||
completion = create_completion(stop=STOP_TOKEN)
|
||||
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
|
||||
# Generate parameters if model decides to call a function
|
||||
elif prompt.endswith(RECIPIENT_TOKEN):
|
||||
function_calls.append(completion_text[:-1])
|
||||
grammar = get_grammar(function_calls[-1])
|
||||
completion = create_completion(stop=[STOP_TOKEN, "\n"])
|
||||
function_bodies.append(completion["choices"][0]["text"].strip())
|
||||
prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
|
||||
function_bodies.append(completion_text.strip())
|
||||
# If "auto" or no tool_choice/function_call
|
||||
elif isinstance(function_call, str) and function_call == "auto":
|
||||
while True:
|
||||
# Generate function name first
|
||||
grammar = None
|
||||
|
||||
# Try to generate the beginning of next turn
|
||||
# If empty completion, break from loop
|
||||
next_turn_completion_text = create_completion(
|
||||
stop=[STOP_TOKEN, RECIPIENT_TOKEN]
|
||||
)["choices"][0]["text"]
|
||||
if len(next_turn_completion_text) > 0:
|
||||
prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}"
|
||||
stops = CONTENT_TOKEN
|
||||
completion = create_completion(stop=stops)
|
||||
completion_text = completion["choices"][0]["text"]
|
||||
function_name = completion_text.strip()
|
||||
if function_name == "all":
|
||||
prompt += "all\n<|content|>"
|
||||
else:
|
||||
break
|
||||
# Break from loop if tool_choice/function_call is provided as a dict
|
||||
else:
|
||||
function_bodies.append(completion_text.strip())
|
||||
break
|
||||
function_call = completion_text.strip()
|
||||
prompt += f"{function_call}\n<|content|>"
|
||||
function_calls.append(function_call)
|
||||
grammar = get_grammar(function_call)
|
||||
# Generate content
|
||||
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
|
||||
completion = create_completion(stop=stops)
|
||||
completion_text = completion["choices"][0]["text"]
|
||||
if function_name == "all":
|
||||
content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
|
||||
content = content.lstrip()
|
||||
# Check whether the model wants to generate another turn
|
||||
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
|
||||
cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
|
||||
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
|
||||
else:
|
||||
break
|
||||
else:
|
||||
function_bodies.append(completion_text.strip())
|
||||
# Check whether the model wants to generate another turn
|
||||
prompt += completion_text.strip()
|
||||
grammar = None
|
||||
completion = create_completion(stop=stops)
|
||||
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
|
||||
prompt += "\n<|from|>assistant\n<|recipient|>"
|
||||
else:
|
||||
break
|
||||
|
||||
assert "usage" in completion
|
||||
assert len(function_calls) > 0
|
||||
assert len(function_calls) == len(function_bodies)
|
||||
|
||||
tool_calls = []
|
||||
|
@ -1843,14 +2075,14 @@ def functionary_v1_v2_chat_handler(
|
|||
"index": 0,
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"content": None if content == "" else content,
|
||||
"function_call": {
|
||||
"name": tool_calls[0]["function"]["name"],
|
||||
"arguments": tool_calls[0]["function"]["arguments"],
|
||||
},
|
||||
"tool_calls": tool_calls,
|
||||
} if len(tool_calls) > 0 else None,
|
||||
"tool_calls": tool_calls if len(tool_calls) > 0 else None,
|
||||
},
|
||||
"finish_reason": "tool_calls",
|
||||
"finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
|
||||
}
|
||||
],
|
||||
usage=completion["usage"],
|
||||
|
@ -2199,181 +2431,6 @@ def chatml_function_calling(
|
|||
stream=stream,
|
||||
)
|
||||
|
||||
def _convert_completion_to_chat_function(
    tool_name: str,
    completion_or_chunks: Union[
        llama_types.CreateCompletionResponse,
        Iterator[llama_types.CreateCompletionStreamResponse],
    ],
    stream: bool,
):
    """Convert a text completion into a chat completion that calls ``tool_name``.

    The completion text becomes the ``arguments`` of the call, reported both
    under the legacy ``function_call`` key and as the only ``tool_calls`` entry.

    Args:
        tool_name: Name reported for the called function/tool.
        completion_or_chunks: A finished completion when ``stream`` is false,
            otherwise an iterator of completion stream chunks.
        stream: Selects which of the two input shapes is expected.

    Returns:
        A chat completion dict when ``stream`` is false; otherwise a generator
        yielding a role-only chunk, one argument delta per input chunk, and a
        final chunk whose ``finish_reason`` is ``"tool_calls"``.
    """
    if not stream:
        completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
        assert "usage" in completion
        tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
        # TODO: Fix for legacy function calls
        chat_completion: llama_types.CreateChatCompletionResponse = {
            "id": "chat" + completion["id"],
            "object": "chat.completion",
            "created": completion["created"],
            "model": completion["model"],
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": None,
                        "function_call": {
                            "name": tool_name,
                            "arguments": completion["choices"][0]["text"],
                        },
                        "tool_calls": [
                            {
                                "id": tool_id,
                                "type": "function",
                                "function": {
                                    "name": tool_name,
                                    "arguments": completion["choices"][0]["text"],
                                },
                            }
                        ],
                    },
                    "finish_reason": "tool_calls",
                }
            ],
            "usage": completion["usage"],
        }
        return chat_completion

    chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore

    def _arguments_delta(chunk, tool_id):
        # One argument delta for one input chunk. The same text goes to both
        # ``function_call`` and ``tool_calls`` so either stream can be
        # concatenated into the complete arguments string.
        return {
            "id": "chat" + chunk["id"],
            "object": "chat.completion.chunk",
            "created": chunk["created"],
            "model": chunk["model"],
            "choices": [
                {
                    "index": 0,
                    "finish_reason": None,
                    "logprobs": None,
                    "delta": {
                        "role": None,
                        "content": None,
                        "function_call": {
                            "name": tool_name,
                            "arguments": chunk["choices"][0]["text"],
                        },
                        "tool_calls": [
                            {
                                "index": 0,
                                "id": tool_id,
                                "type": "function",
                                "function": {
                                    "name": tool_name,
                                    "arguments": chunk["choices"][0]["text"],
                                },
                            }
                        ],
                    },
                }
            ],
        }

    def _stream_response_to_function_stream(
        chunks: Iterator[llama_types.CreateCompletionStreamResponse],
    ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
        first = True
        id_ = None
        created = None
        model = None
        tool_id = None
        for chunk in chunks:
            if first:
                id_ = "chat" + chunk["id"]
                created = chunk["created"]
                model = chunk["model"]
                tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
                # blank first message
                yield {
                    "id": id_,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": model,
                    "choices": [
                        {
                            "index": 0,
                            "finish_reason": None,
                            "logprobs": None,
                            "delta": {
                                "role": "assistant",
                                "content": None,
                                "function_call": None,
                                "tool_calls": None,
                            },
                        }
                    ],
                }
                first = False
            # Fix: the first chunk's text used to be sent only under
            # ``function_call`` while ``tool_calls`` got an empty string, so
            # clients accumulating ``tool_calls`` lost the first fragment.
            # Every chunk, including the first, now carries its text in both.
            yield _arguments_delta(chunk, tool_id)

        if id_ is not None and created is not None and model is not None:
            yield {
                "id": id_,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "finish_reason": "tool_calls",
                        "logprobs": None,
                        "delta": {
                            "role": None,
                            "content": None,
                            "function_call": None,
                            "tool_calls": None,
                        },
                    }
                ],
            }

    return _stream_response_to_function_stream(chunks)
|
||||
|
||||
# Case 2: Tool choice by user
|
||||
if isinstance(tool_choice, dict):
|
||||
tool_name = tool_choice["function"]["name"]
|
||||
|
|
|
@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
|
|||
# bool embeddings; // if true, extract embeddings (together with logits)
|
||||
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
|
||||
|
||||
# // Abort callback
|
||||
# // if it returns true, execution of llama_decode() will be aborted
|
||||
# // currently works only with CPU execution
|
||||
|
@ -667,13 +668,15 @@ It might not exist for progress report where '.' is output repeatedly."""
|
|||
|
||||
# // model quantization parameters
|
||||
# typedef struct llama_model_quantize_params {
|
||||
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
# enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
# bool quantize_output_tensor; // quantize output.weight
|
||||
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
# bool pure; // quantize all tensors to the default type
|
||||
# void * imatrix; // pointer to importance matrix data
|
||||
# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
# enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
# enum ggml_type output_tensor_type; // output tensor type
|
||||
# enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
# bool quantize_output_tensor; // quantize output.weight
|
||||
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
# bool pure; // quantize all tensors to the default type
|
||||
# void * imatrix; // pointer to importance matrix data
|
||||
# } llama_model_quantize_params;
|
||||
class llama_model_quantize_params(ctypes.Structure):
|
||||
"""Parameters for llama_model_quantize
|
||||
|
@ -681,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
|
|||
Attributes:
|
||||
nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||
ftype (int): quantize to this llama_ftype
|
||||
output_tensor_type (int): output tensor type
|
||||
token_embedding_type (int): token embeddings tensor type
|
||||
allow_requantize (bool): allow quantizing non-f32/f16 tensors
|
||||
quantize_output_tensor (bool): quantize output.weight
|
||||
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
pure (bool): quantize all tensors to the default type
|
||||
imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
|
||||
imatrix (ctypes.c_void_p): pointer to importance matrix data
|
||||
"""
|
||||
|
||||
_fields_ = [
|
||||
("nthread", ctypes.c_int32),
|
||||
("ftype", ctypes.c_int),
|
||||
("output_tensor_type", ctypes.c_int),
|
||||
("token_embedding_type", ctypes.c_int),
|
||||
("allow_requantize", ctypes.c_bool),
|
||||
("quantize_output_tensor", ctypes.c_bool),
|
||||
("only_copy", ctypes.c_bool),
|
||||
|
@ -1006,6 +1013,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
|
|||
def llama_n_embd(model: llama_model_p, /) -> int: ...
|
||||
|
||||
|
||||
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int:
    """Binding for the C function ``llama_n_layer`` (declared to return ``int32_t``).

    NOTE(review): presumably the number of layers of ``model``; the mirrored
    header line carries no description -- confirm against llama.h.
    """
    ...
|
||||
|
||||
|
||||
# // Get the model's RoPE frequency scaling factor
|
||||
# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
|
||||
|
@ -1166,12 +1178,18 @@ def llama_model_quantize(
|
|||
...
|
||||
|
||||
|
||||
# // Apply a LoRA adapter to a loaded model
|
||||
# // path_base_model is the path to a higher quality model to use as a base for
|
||||
# // the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
# // will be applied on top of the previous one
|
||||
# // Returns 0 on success
|
||||
# LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
# const struct llama_model * model,
|
||||
# const char * path_lora,
|
||||
# float scale,
|
||||
# const char * path_base_model,
|
||||
# int32_t n_threads);
|
||||
# const char * path_lora,
|
||||
# float scale,
|
||||
# const char * path_base_model,
|
||||
# int32_t n_threads);
|
||||
@ctypes_function(
|
||||
"llama_model_apply_lora_from_file",
|
||||
[
|
||||
|
@ -1190,7 +1208,57 @@ def llama_model_apply_lora_from_file(
|
|||
path_base_model: Union[ctypes.c_char_p, bytes, None],
|
||||
n_threads: Union[ctypes.c_int32, int],
|
||||
/,
|
||||
) -> int: ...
|
||||
) -> int:
|
||||
"""Apply a LoRA adapter to a loaded model
|
||||
path_base_model is the path to a higher quality model to use as a base for
|
||||
the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
will be applied on top of the previous one
|
||||
Returns 0 on success"""
|
||||
...
|
||||
|
||||
|
||||
# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
# // the currently loaded vector.
|
||||
# // n_embd should be the size of a single layer's control, and data should point
|
||||
# // to an n_embd x n_layers buffer starting from layer 1.
|
||||
# // il_start and il_end are the layer range the vector should apply to (both inclusive)
|
||||
# // See llama_control_vector_load in common to load a control vector.
|
||||
# LLAMA_API int32_t llama_control_vector_apply(
|
||||
# struct llama_context * lctx,
|
||||
# const float * data,
|
||||
# size_t len,
|
||||
# int32_t n_embd,
|
||||
# int32_t il_start,
|
||||
# int32_t il_end);
|
||||
@ctypes_function(
    "llama_control_vector_apply",
    [
        llama_context_p_ctypes,
        ctypes.POINTER(ctypes.c_float),
        ctypes.c_size_t,
        ctypes.c_int32,
        ctypes.c_int32,
        ctypes.c_int32,
    ],
    ctypes.c_int32,
)
def llama_control_vector_apply(
    lctx: llama_context_p,
    data: CtypesPointerOrRef[ctypes.c_float],
    len: int,
    n_embd: int,
    il_start: int,
    il_end: int,
    /,
) -> int:
    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
    the currently loaded vector.
    n_embd should be the size of a single layer's control, and data should point
    to an n_embd x n_layers buffer starting from layer 1.
    il_start and il_end are the layer range the vector should apply to (both inclusive)
    See llama_control_vector_load in common to load a control vector.

    Args:
        lctx: Context the control vector is applied to (or cleared from).
        data: Pointer to the float buffer described above, or NULL to clear.
        len: Passed as the C ``size_t len`` argument.
            NOTE(review): presumably the number of floats in ``data`` -- confirm.
        n_embd: Size of a single layer's control.
        il_start: First layer the vector applies to (inclusive).
        il_end: Last layer the vector applies to (inclusive).

    Returns:
        The ``int32_t`` result of the C call.
        NOTE(review): the mirrored header does not state the meaning of the
        return value -- confirm against llama.h.
    """
    ...
|
||||
|
||||
|
||||
# //
|
||||
|
@ -1205,6 +1273,12 @@ def llama_model_apply_lora_from_file(
|
|||
# llama_pos pos;
|
||||
# };
|
||||
class llama_kv_cache_view_cell(ctypes.Structure):
    """Information associated with an individual cell in the KV cache view.

    Attributes:
        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
            May be negative if the cell is not populated."""

    # Single member, matching the ``llama_pos pos;`` field of the C struct
    # quoted in the comment above this class.
    _fields_ = [("pos", llama_pos)]
|
||||
|
||||
|
||||
|
@ -1998,7 +2072,8 @@ def llama_tokenize(
|
|||
|
||||
Returns:
|
||||
Returns the number of tokens on success, no more than n_tokens_max
|
||||
Returns a negative number on failure - the number of tokens that would have been returned"""
|
||||
Returns a negative number on failure - the number of tokens that would have been returned
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
|
@ -2674,6 +2749,48 @@ def llama_beam_search(
|
|||
): ...
|
||||
|
||||
|
||||
# /// @details Build a split GGUF final path for this chunk.
|
||||
# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
||||
# // Returns the split_path length.
|
||||
# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
|
||||
@ctypes_function(
    "llama_split_path",
    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
    ctypes.c_int,
)
def llama_split_path(
    split_path: bytes,
    maxlen: Union[ctypes.c_size_t, int],
    path_prefix: bytes,
    split_no: Union[ctypes.c_int, int],
    split_count: Union[ctypes.c_int, int],
    /,
) -> int:
    """Build a split GGUF final path for this chunk.

    Example from the C header: prefix ``"/models/ggml-model-q4_0"`` with
    ``split_no=2`` and ``split_count=4`` produces
    ``"/models/ggml-model-q4_0-00002-of-00004.gguf"``.

    Args:
        split_path: Output buffer (``char *`` in C) that receives the path.
            NOTE(review): the C side writes into this argument, so an
            immutable ``bytes`` object looks unsuitable; presumably a
            ``ctypes.create_string_buffer`` should be passed -- confirm.
        maxlen: Size of the ``split_path`` buffer (the header example passes
            ``sizeof(split_path)``).
        path_prefix: Path prefix the split file name is built from.
        split_no: Number of this split.
        split_count: Total number of splits.

    Returns:
        The split_path length.
    """
    ...
|
||||
|
||||
|
||||
# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
|
||||
# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
|
||||
# // Returns the split_prefix length.
|
||||
# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
|
||||
@ctypes_function(
    "llama_split_prefix",
    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
    ctypes.c_int,
)
def llama_split_prefix(
    split_prefix: bytes,
    maxlen: Union[ctypes.c_size_t, int],
    split_path: bytes,
    split_no: Union[ctypes.c_int, int],
    split_count: Union[ctypes.c_int, int],
    /,
) -> int:
    """Extract the path prefix from the split_path if and only if the split_no and split_count match.

    Example from the C header: split path
    ``"/models/ggml-model-q4_0-00002-of-00004.gguf"`` with ``split_no=2`` and
    ``split_count=4`` produces the prefix ``"/models/ggml-model-q4_0"``.

    Args:
        split_prefix: Output buffer (``char *`` in C) that receives the prefix.
            NOTE(review): the C side writes into this argument, so an
            immutable ``bytes`` object looks unsuitable; presumably a
            ``ctypes.create_string_buffer`` should be passed -- confirm.
        maxlen: Size of the ``split_prefix`` buffer (the header example
            passes ``64``).
        split_path: Full path of one split file.
        split_no: Split number expected in ``split_path``.
        split_count: Split count expected in ``split_path``.

    Returns:
        The split_prefix length.
    """
    ...
|
||||
|
||||
|
||||
# Performance information
|
||||
|
||||
|
||||
|
|
|
@ -12,14 +12,7 @@ import llama_cpp
|
|||
import anyio
|
||||
from anyio.streams.memory import MemoryObjectSendStream
|
||||
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
|
||||
from fastapi import (
|
||||
Depends,
|
||||
FastAPI,
|
||||
APIRouter,
|
||||
Request,
|
||||
HTTPException,
|
||||
status,
|
||||
)
|
||||
from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
|
||||
from fastapi.middleware import Middleware
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.security import HTTPBearer
|
||||
|
@ -356,7 +349,64 @@ async def create_embedding(
|
|||
)
|
||||
async def create_chat_completion(
|
||||
request: Request,
|
||||
body: CreateChatCompletionRequest,
|
||||
body: CreateChatCompletionRequest = Body(
|
||||
openapi_examples={
|
||||
"normal": {
|
||||
"summary": "Chat Completion",
|
||||
"value": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "What is the capital of France?"},
|
||||
],
|
||||
},
|
||||
},
|
||||
"json_mode": {
|
||||
"summary": "JSON Mode",
|
||||
"value": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Who won the world series in 2020"},
|
||||
],
|
||||
"response_format": { "type": "json_object" }
|
||||
},
|
||||
},
|
||||
"tool_calling": {
|
||||
"summary": "Tool Calling",
|
||||
"value": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Extract Jason is 30 years old."},
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "User",
|
||||
"description": "User record",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "number"},
|
||||
},
|
||||
"required": ["name", "age"],
|
||||
},
|
||||
}
|
||||
}
|
||||
],
|
||||
"tool_choice": {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "User",
|
||||
}
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
),
|
||||
llama_proxy: LlamaProxy = Depends(get_llama_proxy),
|
||||
) -> llama_cpp.ChatCompletion:
|
||||
exclude = {
|
||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
|
||||
Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
|
Loading…
Reference in a new issue