From 8d298b47507c9d82945f0add20d1ddb3e8ea0aa4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 18 Mar 2024 10:26:36 -0400 Subject: [PATCH 01/11] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 85 ++++++++++++++++++++++++++++++++++++++---- vendor/llama.cpp | 2 +- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b9593cf..6b5c1bc 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure): # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution @@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ... def llama_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_n_layer (const struct llama_model * model); +@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) +def llama_n_layer(model: llama_model_p, /) -> int: ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @@ -1166,12 +1172,18 @@ def llama_model_quantize( ... +# // Apply a LoRA adapter to a loaded model +# // path_base_model is the path to a higher quality model to use as a base for +# // the layers modified by the adapter. Can be NULL to use the current loaded model. +# // The model needs to be reloaded before applying a new adapter, otherwise the adapter +# // will be applied on top of the previous one +# // Returns 0 on success # LLAMA_API int32_t llama_model_apply_lora_from_file( # const struct llama_model * model, -# const char * path_lora, -# float scale, -# const char * path_base_model, -# int32_t n_threads); +# const char * path_lora, +# float scale, +# const char * path_base_model, +# int32_t n_threads); @ctypes_function( "llama_model_apply_lora_from_file", [ @@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file( path_base_model: Union[ctypes.c_char_p, bytes, None], n_threads: Union[ctypes.c_int32, int], /, -) -> int: ... +) -> int: + """Apply a LoRA adapter to a loaded model + path_base_model is the path to a higher quality model to use as a base for + the layers modified by the adapter. Can be NULL to use the current loaded model. + The model needs to be reloaded before applying a new adapter, otherwise the adapter + will be applied on top of the previous one + Returns 0 on success""" + ... + + +# // Apply a loaded control vector to a llama_context, or if data is NULL, clear +# // the currently loaded vector. +# // n_embd should be the size of a single layer's control, and data should point +# // to an n_embd x n_layers buffer starting from layer 1. +# // il_start and il_end are the layer range the vector should apply to (both inclusive) +# // See llama_control_vector_load in common to load a control vector. 
+# LLAMA_API int32_t llama_control_vector_apply( +# struct llama_context * lctx, +# const float * data, +# size_t len, +# int32_t n_embd, +# int32_t il_start, +# int32_t il_end); +@ctypes_function( + "llama_control_vector_apply", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_float), + ctypes.c_size_t, + ctypes.c_int32, + ctypes.c_int32, + ctypes.c_int32, + ], + ctypes.c_int32, +) +def llama_control_vector_apply( + lctx: llama_context_p, + data: CtypesPointerOrRef[ctypes.c_float], + len: int, + n_embd: int, + il_start: int, + il_end: int, + /, +) -> int: + """Apply a loaded control vector to a llama_context, or if data is NULL, clear + the currently loaded vector. + n_embd should be the size of a single layer's control, and data should point + to an n_embd x n_layers buffer starting from layer 1. + il_start and il_end are the layer range the vector should apply to (both inclusive) + See llama_control_vector_load in common to load a control vector.""" + ... # // @@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file( # llama_pos pos; # }; class llama_kv_cache_view_cell(ctypes.Structure): + """Information associated with an individual cell in the KV cache view. + + Attributes: + pos (llama_pos): The position for this cell. Takes KV cache shifts into account. + May be negative if the cell is not populated.""" + _fields_ = [("pos", llama_pos)] @@ -1985,7 +2053,7 @@ def llama_tokenize( /, ) -> int: """Convert the provided text into tokens. - + Args: model: The model to use for tokenization. text: The text to tokenize. @@ -1995,10 +2063,11 @@ def llama_tokenize( add_bos: Whether to add a beginning-of-sentence token. special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. - + Returns: Returns the number of tokens on success, no more than n_tokens_max - Returns a negative number on failure - the number of tokens that would have been returned""" + Returns a negative number on failure - the number of tokens that would have been returned + """ ... 
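Usage sketch for the new low-level bindings added above (llama_n_layer and llama_control_vector_apply), assuming a model and context have already been created through the existing llama_cpp.llama_cpp bindings; the zero-valued buffer is only a placeholder for real control-vector data (normally produced by llama_control_vector_load on the C++ side):

    # Sketch: apply a flat control vector to every layer of a loaded context.
    # Assumes `model` (llama_model_p) and `ctx` (llama_context_p) already exist,
    # e.g. via llama_load_model_from_file / llama_new_context_with_model.
    import ctypes
    import llama_cpp

    n_embd = llama_cpp.llama_n_embd(model)
    n_layer = llama_cpp.llama_n_layer(model)  # new binding from this patch

    # Placeholder data: n_embd floats per layer, covering layers 1..n_layer.
    n_floats = n_embd * n_layer
    values = (ctypes.c_float * n_floats)(*([0.0] * n_floats))

    ret = llama_cpp.llama_control_vector_apply(
        ctx,        # llama_context_p
        values,     # float* buffer starting from layer 1
        n_floats,   # total number of floats in the buffer
        n_embd,     # size of a single layer's control
        1,          # il_start (inclusive)
        n_layer,    # il_end (inclusive)
    )
    if ret != 0:
        raise RuntimeError("llama_control_vector_apply failed")

    # Passing data=None clears the currently loaded control vector:
    llama_cpp.llama_control_vector_apply(ctx, None, 0, n_embd, 1, n_layer)
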
diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4e9a7f7..ac9ee6a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc +Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 From 8a60c7bc8cae7aa9770eeac0f482d39350763a6f Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Mon, 18 Mar 2024 22:40:57 +0800 Subject: [PATCH 02/11] fix: Fix and optimize functionary chat handler (#1282) * fix functionary chat logic * further fixes --------- Co-authored-by: Andrei --- llama_cpp/llama_chat_format.py | 131 ++++++++++++++++----------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 81ca552..c89cce8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1596,13 +1596,15 @@ def functionary_v1_v2_chat_handler( function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + else: + function_call = "auto" prompt = prepare_messages_for_inference( messages, tokenizer, version, functions, tools ) # If no tools/functions are provided - if function_call is None and (functions is None or len(functions) == 0): + if function_call == "none" or functions is None or len(functions) == 0: if version == "v1": stop = END_ASSISTANT_TOKEN else: @@ -1630,6 +1632,7 @@ def functionary_v1_v2_chat_handler( logits_processor=logits_processor, grammar=grammar, ) + completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore assert stream is False # TODO: support stream mode @@ -1692,13 +1695,12 @@ def functionary_v1_v2_chat_handler( return completion + content = "" function_calls, function_bodies = [], [] if version == "v1": # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): + if isinstance(function_call, str) and function_call == "auto": stops = ["\n", END_ASSISTANT_TOKEN] # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": @@ -1747,70 +1749,67 @@ def functionary_v1_v2_chat_handler( else: function_bodies.append(completion_text.strip()) else: - # Loop until all parallel function calls are generated - while True: - # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - grammar = None - stops = CONTENT_TOKEN - # If tool_choice/function_call is "none" - elif isinstance(function_call, str) and function_call == "none": - prompt = ( - prepare_messages_for_inference(messages, tokenizer, version, [], []) - + "all\n<|content|>" - ) - stops = STOP_TOKEN - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - stops = STOP_TOKEN - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = STOP_TOKEN - + # If tool_choice/function_call is "none" + if isinstance(function_call, str) and function_call == "none": + prompt = ( + prepare_messages_for_inference(messages, tokenizer, version, [], []) + + "all\n<|content|>" + ) + stops = [STOP_TOKEN, FROM_TOKEN] + completion = create_completion(stop=stops) + completion["choices"][0]["text"] = 
completion["choices"][0]["text"].strip() + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + stops = [STOP_TOKEN, FROM_TOKEN] completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - - # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith( - "all" - ): - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # Generate model response if the model decides not to call any function - elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"): - prompt += completion_text + CONTENT_TOKEN - completion = create_completion(stop=STOP_TOKEN) - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # Generate parameters if model decides to call a function - elif prompt.endswith(RECIPIENT_TOKEN): - function_calls.append(completion_text[:-1]) - grammar = get_grammar(function_calls[-1]) - completion = create_completion(stop=[STOP_TOKEN, "\n"]) - function_bodies.append(completion["choices"][0]["text"].strip()) - prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + function_bodies.append(completion_text.strip()) + # If "auto" or no tool_choice/function_call + elif isinstance(function_call, str) and function_call == "auto": + while True: + # Generate function name first grammar = None - - # Try to generate the beginning of next turn - # If empty completion, break from loop - next_turn_completion_text = create_completion( - stop=[STOP_TOKEN, RECIPIENT_TOKEN] - )["choices"][0]["text"] - if len(next_turn_completion_text) > 0: - prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + stops = CONTENT_TOKEN + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + function_name = completion_text.strip() + if function_name == "all": + prompt += "all\n<|content|>" else: - break - # Break from loop if tool_choice/function_call is provided as a dict - else: - function_bodies.append(completion_text.strip()) - break + function_call = completion_text.strip() + prompt += f"{function_call}\n<|content|>" + function_calls.append(function_call) + grammar = get_grammar(function_call) + # Generate content + stops = [RECIPIENT_TOKEN, STOP_TOKEN] + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + if function_name == "all": + content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n") + content = content.lstrip() + # Check whether the model wants to generate another turn + if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text: + cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip() + prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" + else: + break + else: + function_bodies.append(completion_text.strip()) + # Check whether the model wants to generate another turn + prompt += completion_text.strip() + grammar = None + completion = create_completion(stop=stops) + if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: + prompt += 
"\n<|from|>assistant\n<|recipient|>" + else: + break assert "usage" in completion - assert len(function_calls) > 0 assert len(function_calls) == len(function_bodies) tool_calls = [] @@ -1843,14 +1842,14 @@ def functionary_v1_v2_chat_handler( "index": 0, "message": { "role": "assistant", - "content": None, + "content": None if content == "" else content, "function_call": { "name": tool_calls[0]["function"]["name"], "arguments": tool_calls[0]["function"]["arguments"], - }, - "tool_calls": tool_calls, + } if len(tool_calls) > 0 else None, + "tool_calls": tool_calls if len(tool_calls) > 0 else None, }, - "finish_reason": "tool_calls", + "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", } ], usage=completion["usage"], From bf64752535ac73032a25d6ba9ae0f064246964f2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 18 Mar 2024 11:37:30 -0400 Subject: [PATCH 03/11] chore: Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90dd1e6..a85eaa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.57] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 +- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3 +- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282 +- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c + ## [0.2.56] - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fcbc715..1e802fa 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.56" \ No newline at end of file +__version__ = "0.2.57" \ No newline at end of file From 18d7ce918f45bc2bcf80102239142c49ebe29925 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 19 Mar 2024 04:40:24 -0400 Subject: [PATCH 04/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ac9ee6a..b80cf3b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 +Subproject commit b80cf3b2d1dee0ad325f7a794fecc66befce7336 From 60d8498f212ca1eb4303d95610022a055718bbb8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 19 Mar 2024 04:55:57 -0400 Subject: [PATCH 05/11] feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats (#1273) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add tools/functions variables to Jinja2ChatFormatter Also fixed missing tools/tool_choices parameters in chat_formatter_to_chat_completion_handler(). 
* Set grammar when doing explicit function calling * Add function / tool response for all chat formats --------- Co-authored-by: Sigbjørn Skjæret --- llama_cpp/llama_chat_format.py | 408 +++++++++++++++++++-------------- 1 file changed, 233 insertions(+), 175 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index c89cce8..5bda163 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter): self, *, messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, **kwargs: Any, ) -> ChatFormatterResponse: def raise_exception(message: str): @@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter): bos_token=self.bos_token, raise_exception=raise_exception, add_generation_prompt=self.add_generation_prompt, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, ) return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) @@ -288,6 +296,183 @@ def _convert_completion_to_chat( return _convert_text_completion_to_chat(completion) +def _convert_completion_to_chat_function( + tool_name: str, + completion_or_chunks: Union[ + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], + ], + stream: bool, +): + if not stream: + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + assert "usage" in completion + tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] + # TODO: Fix for legacy function calls + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": completion["usage"], + } + return chat_completion + else: + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + + def _stream_response_to_function_stream( + chunks: Iterator[llama_types.CreateCompletionStreamResponse], + ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # blank first message + first = True + id_ = None + created = None + model = None + tool_id = None + for chunk in chunks: + if first: + id_ = "chat" + chunk["id"] + created = chunk["created"] + model = chunk["model"] + tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": "assistant", + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": 
None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": "", + }, + } + ], + }, + } + ], + } + first = False + continue + assert tool_id is not None + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": chunk["choices"][0][ + "text" + ], + }, + } + ], + }, + } + ], + } + + if id_ is not None and created is not None and model is not None: + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + + return _stream_response_to_function_stream(chunks) + + + def chat_formatter_to_chat_completion_handler( chat_formatter: ChatFormatter, ) -> LlamaChatCompletionHandler: @@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler( messages=messages, functions=functions, function_call=function_call, + tools=tools, + tool_choice=tool_choice, ) prompt = result.prompt if result.stop is not None: @@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler( if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format, verbose=llama.verbose) + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None: + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + completion_or_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler( grammar=grammar, logit_bias=logit_bias, ) + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) return chat_completion_handler @@ 
-2198,181 +2431,6 @@ def chatml_function_calling( stream=stream, ) - def _convert_completion_to_chat_function( - tool_name: str, - completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, - Iterator[llama_types.CreateCompletionStreamResponse], - ], - stream: bool, - ): - if not stream: - completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore - assert "usage" in completion - tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] - # TODO: Fix for legacy function calls - chat_completion: llama_types.CreateChatCompletionResponse = { - "id": "chat" + completion["id"], - "object": "chat.completion", - "created": completion["created"], - "model": completion["model"], - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": None, - "function_call": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - "tool_calls": [ - { - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - } - ], - }, - "finish_reason": "tool_calls", - } - ], - "usage": completion["usage"], - } - return chat_completion - else: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore - - def _stream_response_to_function_stream( - chunks: Iterator[llama_types.CreateCompletionStreamResponse], - ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: - # blank first message - first = True - id_ = None - created = None - model = None - tool_id = None - for chunk in chunks: - if first: - id_ = "chat" + chunk["id"] - created = chunk["created"] - model = chunk["model"] - tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": "assistant", - "content": None, - "function_call": None, - "tool_calls": None, - }, - } - ], - } - yield { - "id": "chat" + chunk["id"], - "object": "chat.completion.chunk", - "created": chunk["created"], - "model": chunk["model"], - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": { - "name": tool_name, - "arguments": chunk["choices"][0]["text"], - }, - "tool_calls": [ - { - "index": 0, - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": "", - }, - } - ], - }, - } - ], - } - first = False - continue - assert tool_id is not None - yield { - "id": "chat" + chunk["id"], - "object": "chat.completion.chunk", - "created": chunk["created"], - "model": chunk["model"], - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": { - "name": tool_name, - "arguments": chunk["choices"][0]["text"], - }, - "tool_calls": [ - { - "index": 0, - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": chunk["choices"][0][ - "text" - ], - }, - } - ], - }, - } - ], - } - - if id_ is not None and created is not None and model is not None: - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": "tool_calls", - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": None, - "tool_calls": None, - }, - } - ], - } - - return 
_stream_response_to_function_stream(chunks) - # Case 2: Tool choice by user if isinstance(tool_choice, dict): tool_name = tool_choice["function"]["name"] From f7decc956207f181710a43e2795f478ad2a5fc5e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 19 Mar 2024 10:52:53 -0400 Subject: [PATCH 06/11] docs: Add chat examples to openapi ui --- llama_cpp/server/app.py | 68 +++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index aa6afc1..2e1081e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,14 +12,7 @@ import llama_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import ( - Depends, - FastAPI, - APIRouter, - Request, - HTTPException, - status, -) +from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer @@ -356,7 +349,64 @@ async def create_embedding( ) async def create_chat_completion( request: Request, - body: CreateChatCompletionRequest, + body: CreateChatCompletionRequest = Body( + openapi_examples={ + "normal": { + "summary": "Chat Completion", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + }, + }, + "json_mode": { + "summary": "JSON Mode", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + "response_format": { "type": "json_object" } + }, + }, + "tool_calling": { + "summary": "Tool Calling", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Extract Jason is 30 years old."}, + ], + "tools": [ + { + "type": "function", + "function": { + "name": "User", + "description": "User record", + "parameters": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + "required": ["name", "age"], + }, + } + } + ], + "tool_choice": { + "type": "function", + "function": { + "name": "User", + } + } + }, + }, + } + ), llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.ChatCompletion: exclude = { From 740f3f38125d1cc1bbe80f25944a292d0e966868 Mon Sep 17 00:00:00 2001 From: bretello Date: Wed, 20 Mar 2024 17:46:09 +0100 Subject: [PATCH 07/11] fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 (#1289) --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4df8ef..7415149 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,11 @@ if (LLAMA_BUILD) set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) endif() + + if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE) + endif() + add_subdirectory(vendor/llama.cpp) install( TARGETS llama From 3db03b73027036cf336fda2448894c36d3899cab Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 20 Mar 2024 13:27:43 -0400 Subject: [PATCH 08/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b80cf3b..f9c7ba3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b80cf3b2d1dee0ad325f7a794fecc66befce7336 +Subproject commit f9c7ba34476ffc4f13ae2cdb1aec493a16eb8d47 From c89be28ef945017e8b64ea5f194ae7073fd872e8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 20 Mar 2024 20:50:47 -0400 Subject: [PATCH 09/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f9c7ba3..42e21c6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f9c7ba34476ffc4f13ae2cdb1aec493a16eb8d47 +Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d From e325a831f015fdc807b436bc3c48e52cce658a18 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Mar 2024 23:43:29 -0400 Subject: [PATCH 10/11] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 64 ++++++++++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 6b5c1bc..1b8f6ca 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -668,13 +668,15 @@ It might not exist for progress report where '.' is output repeatedly.""" # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# void * imatrix; // pointer to importance matrix data +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // itoken embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# void * imatrix; // pointer to importance matrix data # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -682,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure): Attributes: nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() ftype (int): quantize to this llama_ftype + output_tensor_type (int): output tensor type + token_embedding_type (int): itoken embeddings tensor type allow_requantize (bool): allow quantizing non-f32/f16 tensors quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type - imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data + imatrix (ctypes.c_void_p): pointer to importance matrix data """ _fields_ = [ ("nthread", ctypes.c_int32), ("ftype", 
ctypes.c_int), + ("output_tensor_type", ctypes.c_int), + ("token_embedding_type", ctypes.c_int), ("allow_requantize", ctypes.c_bool), ("quantize_output_tensor", ctypes.c_bool), ("only_copy", ctypes.c_bool), @@ -2743,6 +2749,48 @@ def llama_beam_search( ): ... +# /// @details Build a split GGUF final path for this chunk. +# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" +# // Returns the split_path length. +# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); +@ctypes_function( + "llama_split_path", + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], + ctypes.c_int, +) +def llama_split_path( + split_path: bytes, + maxlen: Union[ctypes.c_size_t, int], + path_prefix: bytes, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], + /, +) -> int: + """Build a split GGUF final path for this chunk.""" + ... + + +# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. +# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" +# // Returns the split_prefix length. +# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); +@ctypes_function( + "llama_split_prefix", + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], + ctypes.c_int, +) +def llama_split_prefix( + split_prefix: bytes, + maxlen: Union[ctypes.c_size_t, int], + split_path: bytes, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], + /, +) -> int: + """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" + ... + + # Performance information diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 42e21c6..50ccaf5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d +Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652 From c1325dcdfba7cb331b4c09d110001658b94ebb9f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Mar 2024 23:44:04 -0400 Subject: [PATCH 11/11] fix: tool_call missing first token. --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5bda163..ccf4fd0 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -402,7 +402,7 @@ def _convert_completion_to_chat_function( "type": "function", "function": { "name": tool_name, - "arguments": "", + "arguments": chunk["choices"][0]["text"], }, } ],
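
Usage sketch illustrating the streaming fix in PATCH 11/11: when a tool call is streamed, every delta's arguments fragment, including the very first one, must be concatenated to recover the complete JSON arguments (before this fix the first fragment was emitted as an empty string, dropping the first generated token). The model path, chat format, and tool schema below are illustrative placeholders, not values taken from the patches:

    from llama_cpp import Llama

    # Placeholder model path; any model/chat format with tool-calling support works.
    llm = Llama(model_path="./model.gguf", chat_format="chatml-function-calling")

    tools = [{
        "type": "function",
        "function": {
            "name": "User",
            "description": "User record",
            "parameters": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
                "required": ["name", "age"],
            },
        },
    }]

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Extract Jason is 30 years old."}],
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "User"}},
        stream=True,
    )

    arguments = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        tool_calls = delta.get("tool_calls")
        if tool_calls:
            # Before this fix the first fragment arrived as "", losing the first
            # token; after it, every fragment carries the generated text.
            arguments += tool_calls[0]["function"]["arguments"] or ""

    print(arguments)  # accumulated JSON arguments, e.g. {"name": "Jason", "age": 30}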