Compare commits: 4cb67f59d8...d3afd4507f

13 commits:

- d3afd4507f
- c1325dcdfb
- e325a831f0
- c89be28ef9
- 3db03b7302
- 740f3f3812
- f7decc9562
- 60d8498f21
- 18d7ce918f
- 7d4a5ec59f
- bf64752535
- 8a60c7bc8c
- 8d298b4750
7 changed files with 504 additions and 268 deletions
CHANGELOG.md

@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.57]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
+- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
+- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
+- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
+
 ## [0.2.56]

 - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e
CMakeLists.txt

@@ -17,6 +17,11 @@ if (LLAMA_BUILD)
 set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
 set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
 endif()
+
+if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
+endif()
+
 add_subdirectory(vendor/llama.cpp)
 install(
 TARGETS llama
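The added block turns on LLAMA_METAL_EMBED_LIBRARY when building on Apple Silicon, so the Metal shader library is embedded in the compiled backend instead of being loaded from a separate file at runtime. A hedged way to confirm from Python that the wheel you built offers GPU offload (which on Apple Silicon means Metal); `llama_supports_gpu_offload` is assumed to be present in the low-level bindings of your installed version:

```python
# Hedged sketch: check whether the installed build reports GPU offload support.
# On an Apple Silicon build with Metal enabled this is expected to print True.
import llama_cpp

print(llama_cpp.llama_supports_gpu_offload())
```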
llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.56"
+__version__ = "0.2.57"
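The version string above is what the package reports at runtime, so it is the quickest way to confirm which release is installed:

```python
# Prints the installed package version; "0.2.57" once this change is released.
import llama_cpp

print(llama_cpp.__version__)
```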
llama_cpp/llama_chat_format.py

@@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter):
 self,
 *,
 messages: List[llama_types.ChatCompletionRequestMessage],
+functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
 **kwargs: Any,
 ) -> ChatFormatterResponse:
 def raise_exception(message: str):

@@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter):
 bos_token=self.bos_token,
 raise_exception=raise_exception,
 add_generation_prompt=self.add_generation_prompt,
+functions=functions,
+function_call=function_call,
+tools=tools,
+tool_choice=tool_choice,
 )

 return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
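With these two hunks, Jinja2ChatFormatter accepts the OpenAI-style functions/function_call/tools/tool_choice arguments and forwards them into the template render call, so a model's chat template can reference them. A minimal sketch (not the library's own code) of what the render now sees, using plain Jinja2:

```python
# Hedged illustration: a chat template can now inspect `tools` (and the other
# forwarded keyword arguments) at render time, just like `messages`.
from jinja2 import Environment

template = Environment().from_string(
    "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}\n{% endfor %}"
    "{% if tools %}# {{ tools | length }} tool(s) available\n{% endif %}"
)
print(
    template.render(
        messages=[{"role": "user", "content": "hi"}],
        tools=[{"type": "function", "function": {"name": "User"}}],
    )
)
```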
@@ -288,6 +296,183 @@ def _convert_completion_to_chat(
 return _convert_text_completion_to_chat(completion)


+def _convert_completion_to_chat_function(
+tool_name: str,
+completion_or_chunks: Union[
+llama_types.CreateCompletionResponse,
+Iterator[llama_types.CreateCompletionStreamResponse],
+],
+stream: bool,
+):
+if not stream:
+completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
+assert "usage" in completion
+tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
+# TODO: Fix for legacy function calls
+chat_completion: llama_types.CreateChatCompletionResponse = {
+"id": "chat" + completion["id"],
+"object": "chat.completion",
+"created": completion["created"],
+"model": completion["model"],
+"choices": [
+{
+"index": 0,
+"message": {
+"role": "assistant",
+"content": None,
+"function_call": {
+"name": tool_name,
+"arguments": completion["choices"][0]["text"],
+},
+"tool_calls": [
+{
+"id": tool_id,
+"type": "function",
+"function": {
+"name": tool_name,
+"arguments": completion["choices"][0]["text"],
+},
+}
+],
+},
+"finish_reason": "tool_calls",
+}
+],
+"usage": completion["usage"],
+}
+return chat_completion
+else:
+chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
+
+def _stream_response_to_function_stream(
+chunks: Iterator[llama_types.CreateCompletionStreamResponse],
+) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
+# blank first message
+first = True
+id_ = None
+created = None
+model = None
+tool_id = None
+for chunk in chunks:
+if first:
+id_ = "chat" + chunk["id"]
+created = chunk["created"]
+model = chunk["model"]
+tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
+yield {
+"id": id_,
+"object": "chat.completion.chunk",
+"created": created,
+"model": model,
+"choices": [
+{
+"index": 0,
+"finish_reason": None,
+"logprobs": None,
+"delta": {
+"role": "assistant",
+"content": None,
+"function_call": None,
+"tool_calls": None,
+},
+}
+],
+}
+yield {
+"id": "chat" + chunk["id"],
+"object": "chat.completion.chunk",
+"created": chunk["created"],
+"model": chunk["model"],
+"choices": [
+{
+"index": 0,
+"finish_reason": None,
+"logprobs": None,
+"delta": {
+"role": None,
+"content": None,
+"function_call": {
+"name": tool_name,
+"arguments": chunk["choices"][0]["text"],
+},
+"tool_calls": [
+{
+"index": 0,
+"id": tool_id,
+"type": "function",
+"function": {
+"name": tool_name,
+"arguments": chunk["choices"][0]["text"],
+},
+}
+],
+},
+}
+],
+}
+first = False
+continue
+assert tool_id is not None
+yield {
+"id": "chat" + chunk["id"],
+"object": "chat.completion.chunk",
+"created": chunk["created"],
+"model": chunk["model"],
+"choices": [
+{
+"index": 0,
+"finish_reason": None,
+"logprobs": None,
+"delta": {
+"role": None,
+"content": None,
+"function_call": {
+"name": tool_name,
+"arguments": chunk["choices"][0]["text"],
+},
+"tool_calls": [
+{
+"index": 0,
+"id": tool_id,
+"type": "function",
+"function": {
+"name": tool_name,
+"arguments": chunk["choices"][0]["text"],
+},
+}
+],
+},
+}
+],
+}
+
+if id_ is not None and created is not None and model is not None:
+yield {
+"id": id_,
+"object": "chat.completion.chunk",
+"created": created,
+"model": model,
+"choices": [
+{
+"index": 0,
+"finish_reason": "tool_calls",
+"logprobs": None,
+"delta": {
+"role": None,
+"content": None,
+"function_call": None,
+"tool_calls": None,
+},
+}
+],
+}
+
+return _stream_response_to_function_stream(chunks)
+
+
 def chat_formatter_to_chat_completion_handler(
 chat_formatter: ChatFormatter,
 ) -> LlamaChatCompletionHandler:
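The helper added above turns a raw text completion into an OpenAI-style chat completion whose single tool call carries the generated text as its arguments, with a streaming variant that emits chunk deltas. A hedged sketch of the non-streaming path, fed a hand-built completion-shaped dict whose fields match the ones the helper reads (it is an internal, module-level function, so the import below is only for illustration):

```python
# Hedged sketch; the dict below only mimics a CreateCompletionResponse closely
# enough for the non-streaming branch of the helper.
from llama_cpp.llama_chat_format import _convert_completion_to_chat_function

completion = {
    "id": "cmpl-123",
    "object": "text_completion",
    "created": 0,
    "model": "functionary",
    "choices": [
        {"index": 0, "text": '{"name": "Jason", "age": 30}', "logprobs": None, "finish_reason": "stop"}
    ],
    "usage": {"prompt_tokens": 10, "completion_tokens": 12, "total_tokens": 22},
}

chat = _convert_completion_to_chat_function("User", completion, stream=False)
print(chat["choices"][0]["finish_reason"])                                 # tool_calls
print(chat["choices"][0]["message"]["tool_calls"][0]["function"]["name"])  # User
```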
@@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler(
 messages=messages,
 functions=functions,
 function_call=function_call,
+tools=tools,
+tool_choice=tool_choice,
 )
 prompt = result.prompt
 if result.stop is not None:
@@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler(
 if response_format is not None and response_format["type"] == "json_object":
 grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)

+# Convert legacy functions to tools
+if functions is not None:
+tools = [
+{
+"type": "function",
+"function": function,
+}
+for function in functions
+]
+
+# Convert legacy function_call to tool_choice
+if function_call is not None:
+if isinstance(function_call, str) and (
+function_call == "none" or function_call == "auto"
+):
+tool_choice = function_call
+if isinstance(function_call, dict) and "name" in function_call:
+tool_choice = {
+"type": "function",
+"function": {
+"name": function_call["name"],
+},
+}
+
+tool = None
+if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
+name = tool_choice["function"]["name"]
+tool = next((t for t in tools if t["function"]["name"] == name), None)
+if tool is None:
+raise ValueError(f"Tool choice '{name}' not found in tools.")
+schema = tool["function"]["parameters"]
+try:
+# create grammar from json schema
+grammar = llama_grammar.LlamaGrammar.from_json_schema(
+json.dumps(schema), verbose=llama.verbose
+)
+except Exception as e:
+grammar = llama_grammar.LlamaGrammar.from_string(
+llama_grammar.JSON_GBNF, verbose=llama.verbose
+)
+
 completion_or_chunks = llama.create_completion(
 prompt=prompt,
 temperature=temperature,
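The block above makes the generic chat handler honor tools/tool_choice: legacy functions/function_call are converted to the tool form, and when a specific tool is chosen its JSON-schema parameters are compiled into a GBNF grammar (falling back to a generic JSON grammar) so generation is constrained to schema-valid arguments. A hedged sketch of that schema-to-grammar step on its own, using the same call shape as the diff:

```python
# Hedged sketch of the grammar construction used above, outside the handler.
import json

from llama_cpp import llama_grammar

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
    "required": ["name", "age"],
}
grammar = llama_grammar.LlamaGrammar.from_json_schema(json.dumps(schema), verbose=False)
# The resulting object is what gets passed to Llama.create_completion(..., grammar=grammar).
```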
@@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler(
 grammar=grammar,
 logit_bias=logit_bias,
 )
+if tool is not None:
+tool_name = tool["function"]["name"]
+return _convert_completion_to_chat_function(
+tool_name, completion_or_chunks, stream
+)
 return _convert_completion_to_chat(completion_or_chunks, stream=stream)

 return chat_completion_handler
@@ -1596,13 +1829,15 @@ def functionary_v1_v2_chat_handler(
 function_call = (
 tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
 )
+else:
+function_call = "auto"

 prompt = prepare_messages_for_inference(
 messages, tokenizer, version, functions, tools
 )

 # If no tools/functions are provided
-if function_call is None and (functions is None or len(functions) == 0):
+if function_call == "none" or functions is None or len(functions) == 0:
 if version == "v1":
 stop = END_ASSISTANT_TOKEN
 else:
@@ -1630,6 +1865,7 @@ def functionary_v1_v2_chat_handler(
 logits_processor=logits_processor,
 grammar=grammar,
 )
+completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
 return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore

 assert stream is False  # TODO: support stream mode
@@ -1692,13 +1928,12 @@ def functionary_v1_v2_chat_handler(

 return completion

+content = ""
 function_calls, function_bodies = [], []

 if version == "v1":
 # If no or "auto" tool_choice/function_call
-if function_call is None or (
-isinstance(function_call, str) and function_call == "auto"
-):
+if isinstance(function_call, str) and function_call == "auto":
 stops = ["\n", END_ASSISTANT_TOKEN]
 # If tool_choice/function_call is "none"
 elif isinstance(function_call, str) and function_call == "none":
@@ -1747,70 +1982,67 @@ def functionary_v1_v2_chat_handler(
 else:
 function_bodies.append(completion_text.strip())
 else:
-# Loop until all parallel function calls are generated
-while True:
-# If no or "auto" tool_choice/function_call
-if function_call is None or (
-isinstance(function_call, str) and function_call == "auto"
-):
-grammar = None
-stops = CONTENT_TOKEN
 # If tool_choice/function_call is "none"
-elif isinstance(function_call, str) and function_call == "none":
+if isinstance(function_call, str) and function_call == "none":
 prompt = (
 prepare_messages_for_inference(messages, tokenizer, version, [], [])
 + "all\n<|content|>"
 )
-stops = STOP_TOKEN
+stops = [STOP_TOKEN, FROM_TOKEN]
+completion = create_completion(stop=stops)
+completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
+return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
 # If tool_choice/function_call is provided
 elif isinstance(function_call, dict):
 prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
-stops = STOP_TOKEN
 function_call = function_call["name"]
 function_calls.append(function_call)
 grammar = get_grammar(function_call)
-else:
-prompt = prompt
-stops = STOP_TOKEN
+stops = [STOP_TOKEN, FROM_TOKEN]

 completion = create_completion(stop=stops)
 completion_text = completion["choices"][0]["text"]
-# If the generation does not involve a function call
-if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
-"all"
-):
-return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
-# Generate model response if the model decides not to call any function
-elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
-prompt += completion_text + CONTENT_TOKEN
-completion = create_completion(stop=STOP_TOKEN)
-return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
-# Generate parameters if model decides to call a function
-elif prompt.endswith(RECIPIENT_TOKEN):
-function_calls.append(completion_text[:-1])
-grammar = get_grammar(function_calls[-1])
-completion = create_completion(stop=[STOP_TOKEN, "\n"])
-function_bodies.append(completion["choices"][0]["text"].strip())
-prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
+function_bodies.append(completion_text.strip())
+# If "auto" or no tool_choice/function_call
+elif isinstance(function_call, str) and function_call == "auto":
+while True:
+# Generate function name first
 grammar = None
-# Try to generate the beginning of next turn
-# If empty completion, break from loop
-next_turn_completion_text = create_completion(
-stop=[STOP_TOKEN, RECIPIENT_TOKEN]
-)["choices"][0]["text"]
-if len(next_turn_completion_text) > 0:
-prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}"
+stops = CONTENT_TOKEN
+completion = create_completion(stop=stops)
+completion_text = completion["choices"][0]["text"]
+function_name = completion_text.strip()
+if function_name == "all":
+prompt += "all\n<|content|>"
+else:
+function_call = completion_text.strip()
+prompt += f"{function_call}\n<|content|>"
+function_calls.append(function_call)
+grammar = get_grammar(function_call)
+# Generate content
+stops = [RECIPIENT_TOKEN, STOP_TOKEN]
+completion = create_completion(stop=stops)
+completion_text = completion["choices"][0]["text"]
+if function_name == "all":
+content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
+content = content.lstrip()
+# Check whether the model wants to generate another turn
+if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
+cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
+prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
 else:
 break
-# Break from loop if tool_choice/function_call is provided as a dict
 else:
 function_bodies.append(completion_text.strip())
+# Check whether the model wants to generate another turn
+prompt += completion_text.strip()
+grammar = None
+completion = create_completion(stop=stops)
+if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
+prompt += "\n<|from|>assistant\n<|recipient|>"
+else:
 break

 assert "usage" in completion
-assert len(function_calls) > 0
 assert len(function_calls) == len(function_bodies)

 tool_calls = []
@@ -1843,14 +2075,14 @@ def functionary_v1_v2_chat_handler(
 "index": 0,
 "message": {
 "role": "assistant",
-"content": None,
+"content": None if content == "" else content,
 "function_call": {
 "name": tool_calls[0]["function"]["name"],
 "arguments": tool_calls[0]["function"]["arguments"],
-},
-"tool_calls": tool_calls,
+} if len(tool_calls) > 0 else None,
+"tool_calls": tool_calls if len(tool_calls) > 0 else None,
 },
-"finish_reason": "tool_calls",
+"finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
 }
 ],
 usage=completion["usage"],
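Together, the functionary hunks above let the v2 handler default tool_choice to "auto", generate plain content when the model answers with the "all" recipient, loop over parallel tool calls, and report finish_reason "stop" with textual content when no tool was called. A hedged end-to-end sketch of the public API these handlers sit behind; the model path is a placeholder, and the functionary formats are normally paired with their matching Hugging Face tokenizer as described in the project README:

```python
# Hedged usage sketch; model_path is hypothetical and the functionary chat
# formats may additionally need tokenizer=LlamaHFTokenizer.from_pretrained(...).
from llama_cpp import Llama

llm = Llama(model_path="./functionary-small-v2.2.q4_0.gguf", chat_format="functionary-v2")
result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Extract Jason is 30 years old."}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "User",
                "description": "User record",
                "parameters": {
                    "type": "object",
                    "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
                    "required": ["name", "age"],
                },
            },
        }
    ],
    tool_choice={"type": "function", "function": {"name": "User"}},
)
message = result["choices"][0]["message"]
print(message["tool_calls"])  # expected to contain a "User" call with JSON arguments
```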
@@ -2199,181 +2431,6 @@ def chatml_function_calling(
 stream=stream,
 )

-def _convert_completion_to_chat_function(
-tool_name: str,
-completion_or_chunks: Union[
-llama_types.CreateCompletionResponse,
-Iterator[llama_types.CreateCompletionStreamResponse],
-],
-stream: bool,
-):
-if not stream:
-completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
-assert "usage" in completion
-tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
-# TODO: Fix for legacy function calls
-chat_completion: llama_types.CreateChatCompletionResponse = {
-"id": "chat" + completion["id"],
-"object": "chat.completion",
-"created": completion["created"],
-"model": completion["model"],
-"choices": [
-{
-"index": 0,
-"message": {
-"role": "assistant",
-"content": None,
-"function_call": {
-"name": tool_name,
-"arguments": completion["choices"][0]["text"],
-},
-"tool_calls": [
-{
-"id": tool_id,
-"type": "function",
-"function": {
-"name": tool_name,
-"arguments": completion["choices"][0]["text"],
-},
-}
-],
-},
-"finish_reason": "tool_calls",
-}
-],
-"usage": completion["usage"],
-}
-return chat_completion
-else:
-chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
-
-def _stream_response_to_function_stream(
-chunks: Iterator[llama_types.CreateCompletionStreamResponse],
-) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
-# blank first message
-first = True
-id_ = None
-created = None
-model = None
-tool_id = None
-for chunk in chunks:
-if first:
-id_ = "chat" + chunk["id"]
-created = chunk["created"]
-model = chunk["model"]
-tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
-yield {
-"id": id_,
-"object": "chat.completion.chunk",
-"created": created,
-"model": model,
-"choices": [
-{
-"index": 0,
-"finish_reason": None,
-"logprobs": None,
-"delta": {
-"role": "assistant",
-"content": None,
-"function_call": None,
-"tool_calls": None,
-},
-}
-],
-}
-yield {
-"id": "chat" + chunk["id"],
-"object": "chat.completion.chunk",
-"created": chunk["created"],
-"model": chunk["model"],
-"choices": [
-{
-"index": 0,
-"finish_reason": None,
-"logprobs": None,
-"delta": {
-"role": None,
-"content": None,
-"function_call": {
-"name": tool_name,
-"arguments": chunk["choices"][0]["text"],
-},
-"tool_calls": [
-{
-"index": 0,
-"id": tool_id,
-"type": "function",
-"function": {
-"name": tool_name,
-"arguments": "",
-},
-}
-],
-},
-}
-],
-}
-first = False
-continue
-assert tool_id is not None
-yield {
-"id": "chat" + chunk["id"],
-"object": "chat.completion.chunk",
-"created": chunk["created"],
-"model": chunk["model"],
-"choices": [
-{
-"index": 0,
-"finish_reason": None,
-"logprobs": None,
-"delta": {
-"role": None,
-"content": None,
-"function_call": {
-"name": tool_name,
-"arguments": chunk["choices"][0]["text"],
-},
-"tool_calls": [
-{
-"index": 0,
-"id": tool_id,
-"type": "function",
-"function": {
-"name": tool_name,
-"arguments": chunk["choices"][0]["text"],
-},
-}
-],
-},
-}
-],
-}
-
-if id_ is not None and created is not None and model is not None:
-yield {
-"id": id_,
-"object": "chat.completion.chunk",
-"created": created,
-"model": model,
-"choices": [
-{
-"index": 0,
-"finish_reason": "tool_calls",
-"logprobs": None,
-"delta": {
-"role": None,
-"content": None,
-"function_call": None,
-"tool_calls": None,
-},
-}
-],
-}
-
-return _stream_response_to_function_stream(chunks)

 # Case 2: Tool choice by user
 if isinstance(tool_choice, dict):
 tool_name = tool_choice["function"]["name"]
llama_cpp/llama_cpp.py

@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

+
 # // Abort callback
 # // if it returns true, execution of llama_decode() will be aborted
 # // currently works only with CPU execution
@@ -669,6 +670,8 @@ It might not exist for progress report where '.' is output repeatedly."""
 # typedef struct llama_model_quantize_params {
 # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
 # enum llama_ftype ftype; // quantize to this llama_ftype
+# enum ggml_type output_tensor_type; // output tensor type
+# enum ggml_type token_embedding_type; // itoken embeddings tensor type
 # bool allow_requantize; // allow quantizing non-f32/f16 tensors
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -681,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
 Attributes:
 nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
 ftype (int): quantize to this llama_ftype
+output_tensor_type (int): output tensor type
+token_embedding_type (int): itoken embeddings tensor type
 allow_requantize (bool): allow quantizing non-f32/f16 tensors
 quantize_output_tensor (bool): quantize output.weight
 only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 pure (bool): quantize all tensors to the default type
-imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
+imatrix (ctypes.c_void_p): pointer to importance matrix data
 """

 _fields_ = [
 ("nthread", ctypes.c_int32),
 ("ftype", ctypes.c_int),
+("output_tensor_type", ctypes.c_int),
+("token_embedding_type", ctypes.c_int),
 ("allow_requantize", ctypes.c_bool),
 ("quantize_output_tensor", ctypes.c_bool),
 ("only_copy", ctypes.c_bool),
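The struct above mirrors llama.cpp's llama_model_quantize_params, so the two new fields become visible to Python through ctypes. A hedged sketch that simply reads them off the default parameters:

```python
# Hedged sketch: the new fields are plain ctypes ints on the default params struct.
import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
print(params.nthread, params.ftype)
print(params.output_tensor_type, params.token_embedding_type)
```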
@@ -1006,6 +1013,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_n_embd(model: llama_model_p, /) -> int: ...


+# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_n_layer(model: llama_model_p, /) -> int: ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@@ -1166,6 +1178,12 @@ def llama_model_quantize(
 ...


+# // Apply a LoRA adapter to a loaded model
+# // path_base_model is the path to a higher quality model to use as a base for
+# // the layers modified by the adapter. Can be NULL to use the current loaded model.
+# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# // will be applied on top of the previous one
+# // Returns 0 on success
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 # const struct llama_model * model,
 # const char * path_lora,
@@ -1190,7 +1208,57 @@ def llama_model_apply_lora_from_file(
 path_base_model: Union[ctypes.c_char_p, bytes, None],
 n_threads: Union[ctypes.c_int32, int],
 /,
-) -> int: ...
+) -> int:
+"""Apply a LoRA adapter to a loaded model
+path_base_model is the path to a higher quality model to use as a base for
+the layers modified by the adapter. Can be NULL to use the current loaded model.
+The model needs to be reloaded before applying a new adapter, otherwise the adapter
+will be applied on top of the previous one
+Returns 0 on success"""
+...
+
+
+# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+# // the currently loaded vector.
+# // n_embd should be the size of a single layer's control, and data should point
+# // to an n_embd x n_layers buffer starting from layer 1.
+# // il_start and il_end are the layer range the vector should apply to (both inclusive)
+# // See llama_control_vector_load in common to load a control vector.
+# LLAMA_API int32_t llama_control_vector_apply(
+# struct llama_context * lctx,
+# const float * data,
+# size_t len,
+# int32_t n_embd,
+# int32_t il_start,
+# int32_t il_end);
+@ctypes_function(
+"llama_control_vector_apply",
+[
+llama_context_p_ctypes,
+ctypes.POINTER(ctypes.c_float),
+ctypes.c_size_t,
+ctypes.c_int32,
+ctypes.c_int32,
+ctypes.c_int32,
+],
+ctypes.c_int32,
+)
+def llama_control_vector_apply(
+lctx: llama_context_p,
+data: CtypesPointerOrRef[ctypes.c_float],
+len: int,
+n_embd: int,
+il_start: int,
+il_end: int,
+/,
+) -> int:
+"""Apply a loaded control vector to a llama_context, or if data is NULL, clear
+the currently loaded vector.
+n_embd should be the size of a single layer's control, and data should point
+to an n_embd x n_layers buffer starting from layer 1.
+il_start and il_end are the layer range the vector should apply to (both inclusive)
+See llama_control_vector_load in common to load a control vector."""
+...

 # //
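The new llama_control_vector_apply binding above takes a llama_context pointer, a float buffer of n_embd x n_layers values (starting from layer 1), the buffer length, and the inclusive layer range; passing NULL clears the currently loaded vector. A hedged sketch of the call shape, assuming `ctx` is an already-initialized low-level context pointer:

```python
# Hedged sketch around the new binding; `ctx` is assumed to come from the
# low-level API (e.g. llama_cpp.llama_new_context_with_model).
import ctypes

import llama_cpp


def apply_zero_control_vector(ctx, n_embd: int, n_layers: int) -> int:
    # n_embd x n_layers floats, starting from layer 1; all zeros here.
    data = (ctypes.c_float * (n_embd * n_layers))()
    return llama_cpp.llama_control_vector_apply(ctx, data, len(data), n_embd, 1, n_layers)


def clear_control_vector(ctx) -> int:
    # NULL data clears the currently loaded vector.
    return llama_cpp.llama_control_vector_apply(ctx, None, 0, 0, 0, 0)
```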
@@ -1205,6 +1273,12 @@ def llama_model_apply_lora_from_file(
 # llama_pos pos;
 # };
 class llama_kv_cache_view_cell(ctypes.Structure):
+"""Information associated with an individual cell in the KV cache view.
+
+Attributes:
+pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
+May be negative if the cell is not populated."""
+
 _fields_ = [("pos", llama_pos)]


@@ -1998,7 +2072,8 @@ def llama_tokenize(

 Returns:
 Returns the number of tokens on success, no more than n_tokens_max
-Returns a negative number on failure - the number of tokens that would have been returned"""
+Returns a negative number on failure - the number of tokens that would have been returned
+"""
 ...


@@ -2674,6 +2749,48 @@ def llama_beam_search(
 ): ...


+# /// @details Build a split GGUF final path for this chunk.
+# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# // Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+"llama_split_path",
+[ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+ctypes.c_int,
+)
+def llama_split_path(
+split_path: bytes,
+maxlen: Union[ctypes.c_size_t, int],
+path_prefix: bytes,
+split_no: Union[ctypes.c_int, int],
+split_count: Union[ctypes.c_int, int],
+/,
+) -> int:
+"""Build a split GGUF final path for this chunk."""
+...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# // Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+"llama_split_prefix",
+[ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+ctypes.c_int,
+)
+def llama_split_prefix(
+split_prefix: bytes,
+maxlen: Union[ctypes.c_size_t, int],
+split_path: bytes,
+split_no: Union[ctypes.c_int, int],
+split_count: Union[ctypes.c_int, int],
+/,
+) -> int:
+"""Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+...
+
+
 # Performance information
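These two helpers compute the file names used for split (sharded) GGUF models. A hedged sketch of calling the path builder from Python with a mutable buffer, following the example given in the comment above:

```python
# Hedged sketch: build the name of shard 2 of 4 for a split GGUF model prefix.
import ctypes

import llama_cpp

buf = ctypes.create_string_buffer(256)
n = llama_cpp.llama_split_path(
    ctypes.cast(buf, ctypes.c_char_p), ctypes.sizeof(buf), b"/models/ggml-model-q4_0", 2, 4
)
print(n, buf.value)  # expected: length written and b"/models/ggml-model-q4_0-00002-of-00004.gguf"
```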
llama_cpp/server/app.py

@@ -12,14 +12,7 @@ import llama_cpp
 import anyio
 from anyio.streams.memory import MemoryObjectSendStream
 from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
-from fastapi import (
-Depends,
-FastAPI,
-APIRouter,
-Request,
-HTTPException,
-status,
-)
+from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
 from fastapi.middleware import Middleware
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import HTTPBearer
@@ -356,7 +349,64 @@ async def create_embedding(
 )
 async def create_chat_completion(
 request: Request,
-body: CreateChatCompletionRequest,
+body: CreateChatCompletionRequest = Body(
+openapi_examples={
+"normal": {
+"summary": "Chat Completion",
+"value": {
+"model": "gpt-3.5-turbo",
+"messages": [
+{"role": "system", "content": "You are a helpful assistant."},
+{"role": "user", "content": "What is the capital of France?"},
+],
+},
+},
+"json_mode": {
+"summary": "JSON Mode",
+"value": {
+"model": "gpt-3.5-turbo",
+"messages": [
+{"role": "system", "content": "You are a helpful assistant."},
+{"role": "user", "content": "Who won the world series in 2020"},
+],
+"response_format": { "type": "json_object" }
+},
+},
+"tool_calling": {
+"summary": "Tool Calling",
+"value": {
+"model": "gpt-3.5-turbo",
+"messages": [
+{"role": "system", "content": "You are a helpful assistant."},
+{"role": "user", "content": "Extract Jason is 30 years old."},
+],
+"tools": [
+{
+"type": "function",
+"function": {
+"name": "User",
+"description": "User record",
+"parameters": {
+"type": "object",
+"properties": {
+"name": {"type": "string"},
+"age": {"type": "number"},
+},
+"required": ["name", "age"],
+},
+}
+}
+],
+"tool_choice": {
+"type": "function",
+"function": {
+"name": "User",
+}
+}
+},
+},
+}
+),
 llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> llama_cpp.ChatCompletion:
 exclude = {
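Besides consolidating the fastapi imports, this hunk attaches three OpenAPI request examples (normal, JSON mode, tool calling) to the /v1/chat/completions endpoint via fastapi's Body(openapi_examples=...), so they show up in the server's Swagger UI. A hedged sketch sending the JSON-mode example to a locally running `python -m llama_cpp.server` instance (default address assumed), using only the standard library:

```python
# Hedged sketch: POST the "json_mode" example body to a local server instance.
import json
import urllib.request

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020"},
    ],
    "response_format": {"type": "json_object"},
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```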
vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652