feat: Implement streaming for Functionary v2 + Bug fixes (#1419)
* set up streaming for v2
* assert v2 streaming, fix tool_call vs function_call
* fix streaming with tool_choice/function_call
* make functions return 1 function call only when 'auto'
* fix

---------

Co-authored-by: Andrei <abetlen@gmail.com>
parent f9b7221c8f
commit 1f56c648c3
1 changed file with 443 additions and 133 deletions
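
As a usage sketch of what this change enables (illustrative only, loosely based on the project's functionary docs; the repo id, file name, and weather tool below are assumptions, not part of this commit):

from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Illustrative model choice; functionary models are paired with their own HF tokenizer.
llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
    n_ctx=4096,
)

chunks = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the weather in Istanbul?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_current_weather",  # hypothetical tool
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }],
    tool_choice="auto",
    stream=True,  # now supported for functionary v2 by this commit
)

# Per this commit, the first tool_call delta carries the name (with empty arguments),
# later deltas carry argument fragments, and the final chunk sets finish_reason="tool_calls".
name, arguments = None, ""
for chunk in chunks:
    delta = chunk["choices"][0]["delta"]
    for call in delta.get("tool_calls") or []:
        name = call["function"]["name"] or name
        arguments += call["function"]["arguments"] or ""
    if delta.get("content"):
        print(delta["content"], end="")  # plain assistant content when no tool is called
print(name, arguments)
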
@@ -1894,6 +1894,8 @@ def functionary_v1_v2_chat_handler(
        function_call = (
            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
        )
    elif function_call is not None:
        pass
    else:
        function_call = "auto"
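
For reference, a minimal standalone restatement of the tool_choice handling in the hunk above; the helper name and sample values are illustrative and mirror, rather than reproduce, the surrounding code:

# Illustration only: reduce an OpenAI-style tool_choice or legacy function_call to one value.
def resolve_function_call(tool_choice=None, function_call=None):
    if tool_choice is not None:
        # dict form: {"type": "function", "function": {"name": "get_current_weather"}}
        return tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
    if function_call is not None:
        return function_call  # caller used the legacy functions/function_call API
    return "auto"             # default: let the model decide

assert resolve_function_call() == "auto"
assert resolve_function_call(tool_choice={"type": "function", "function": {"name": "f"}}) == {"name": "f"}
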
@@ -1930,11 +1932,10 @@ def functionary_v1_v2_chat_handler(
            logits_processor=logits_processor,
            grammar=grammar,
        )
        if stream is False:
            completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
        return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore

    assert stream is False  # TODO: support stream mode

    def get_grammar(function_call):
        function_body = None
        for function in functions or []:
@@ -1968,7 +1969,7 @@ def functionary_v1_v2_chat_handler(

        return grammar

    def create_completion(stop):
    def create_completion(prompt, stop, grammar):
        completion = cast(llama_types.Completion, llama.create_completion(
            prompt=prompt,
            temperature=temperature,
@@ -1976,7 +1977,7 @@ def functionary_v1_v2_chat_handler(
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
            stream=False,
            stream=stream,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
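
Because create_completion now honors the caller's stream flag, each call in the streaming path below returns an iterator of completion chunks whose text fragments are concatenated. A self-contained sketch of that consumption pattern, using fake chunks rather than a real model:

# Toy stand-in for the iterator returned by create_completion(..., stream=True);
# real chunks expose the same chunk["choices"][0]["text"] shape used throughout this diff.
fake_stream = iter([
    {"choices": [{"text": "get_current"}]},
    {"choices": [{"text": "_weather"}]},
])

completion_text = ""
for chunk in fake_stream:
    completion_text += chunk["choices"][0]["text"]

print(completion_text.strip())  # -> get_current_weather
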
@@ -1997,6 +1998,315 @@ def functionary_v1_v2_chat_handler(
    function_calls, function_bodies = [], []
    completion_tokens = 0

    def generate_streaming(tools, functions, function_call, prompt):
        assert version == "v2", "Streaming for v1 is not supported"

        chunk_id, chunk_created = None, None

        # If tool_choice/function_call is provided
        if isinstance(function_call, dict):
            prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
            grammar = get_grammar(function_call["name"])
            stops = [STOP_TOKEN, FROM_TOKEN]
            tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
            completion_text = ""
            first = True
            for chunk in completion:
                # Yield the tool/function name first
                if first:
                    if tools is not None:
                        func_call_dict = {
                            "tool_calls": [
                                {
                                    "index": 0,
                                    "id": "call_" + tool_id,
                                    "type": "function",
                                    "function": {"name": function_call["name"], "arguments": ""},
                                }
                            ]
                        }
                    else:
                        func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
                    yield llama_types.CreateChatCompletionStreamResponse(
                        id="chat" + chunk["id"],
                        object="chat.completion.chunk",
                        created=chunk["created"],
                        model=chunk["model"],
                        choices=[
                            {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
                        ],
                    )
                    first = False
                if tools is not None:
                    func_call_dict = {
                        "tool_calls": [
                            {
                                "index": 0,
                                "id": "call_" + tool_id,
                                "type": "function",
                                "function": {
                                    "name": None,
                                    "arguments": chunk["choices"][0]["text"].rstrip(),
                                },
                            }
                        ]
                    }
                else:
                    func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
                if len(chunk["choices"][0]["text"].rstrip()) > 0:
                    yield llama_types.CreateChatCompletionStreamResponse(
                        id="chat" + chunk["id"],
                        object="chat.completion.chunk",
                        created=chunk["created"],
                        model=chunk["model"],
                        choices=[
                            {
                                "index": 0,
                                "logprobs": chunk["choices"][0]["logprobs"],
                                "delta": {
                                    "role": None,
                                    "content": None,
                                    **func_call_dict,
                                },
                            }
                        ],
                    )
            # Yield tool_call/function_call stop message
            yield llama_types.CreateChatCompletionStreamResponse(
                id="chat" + chunk["id"],
                object="chat.completion.chunk",
                created=chunk["created"],
                model=chunk["model"],
                choices=[
                    {
                        "index": 0,
                        "finish_reason": "tool_calls" if tools is not None else "function_call",
                        "logprobs": None,
                        "delta": {
                            "role": None, "content": None, "function_call": None, "tool_calls": None
                        },
                    }
                ],
            )
        # If "auto" or no tool_choice/function_call
        elif isinstance(function_call, str) and function_call == "auto":
            tool_index = 0
            while True:
                # Generate function name first
                grammar = None
                stops = CONTENT_TOKEN
                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                completion_text = ""
                for chunk in completion:
                    completion_text += chunk["choices"][0]["text"]
                    if chunk_id is None:
                        chunk_id = chunk["id"]
                    if chunk_created is None:
                        chunk_created = chunk["created"]
                function_name = completion_text.strip()
                if function_name == "all":
                    prompt += "all\n<|content|>"
                    # Yield the first empty message for content
                    yield llama_types.CreateChatCompletionStreamResponse(
                        id="chat" + chunk_id,
                        model=chunk["model"],
                        created=chunk_created,
                        object="chat.completion.chunk",
                        choices=[
                            {
                                "index": 0,
                                "delta": {"role": "assistant", "content": ""},
                                "logprobs": None,
                                "finish_reason": None,
                            }
                        ],
                    )
                else:
                    prompt += f"{function_name}\n<|content|>"
                    grammar = get_grammar(function_name)
                    tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
                    if tools is not None:
                        func_call_dict = {
                            "tool_calls": [
                                {
                                    "index": tool_index,
                                    "id": "call_" + tool_id,
                                    "type": "function",
                                    "function": {"name": function_name, "arguments": ""},
                                }
                            ]
                        }
                    else:
                        func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
                    # Stream function name
                    yield llama_types.CreateChatCompletionStreamResponse(
                        id="chat" + chunk_id,
                        object="chat.completion.chunk",
                        created=chunk_created,
                        model=chunk["model"],
                        choices=[
                            {
                                "index": 0,
                                "logprobs": chunk["choices"][0]["logprobs"],
                                "delta": {
                                    "role": "assistant",
                                    "content": None,
                                    **func_call_dict,
                                },
                            }
                        ],
                    )
                # Generate content
                stops = [RECIPIENT_TOKEN, STOP_TOKEN]
                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                if function_name == "all":
                    completion_text = ""
                    stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
                    for i, chunk in enumerate(completion):
                        completion_text += chunk["choices"][0]["text"]
                        if is_end:
                            buffer.append(chunk["choices"][0]["text"].strip(" "))
                            if stop_sequence.startswith("".join(buffer)):
                                continue
                            else:
                                buffer.pop()
                                while len(buffer) > 0:
                                    yield llama_types.CreateChatCompletionStreamResponse(
                                        id="chat" + chunk_id,
                                        object="chat.completion.chunk",
                                        created=chunk_created,
                                        model=chunk["model"],
                                        choices=[
                                            {
                                                "index": 0,
                                                "logprobs": chunk["choices"][0]["logprobs"],
                                                "delta": {
                                                    "role": "assistant", "content": buffer.pop(0)
                                                },
                                            }
                                        ],
                                    )
                                is_end = False
                        elif chunk["choices"][0]["text"] == "\n":
                            is_end = True
                            buffer.append(chunk["choices"][0]["text"].strip(" "))
                            continue

                        if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
                            yield llama_types.CreateChatCompletionStreamResponse(
                                id="chat" + chunk_id,
                                object="chat.completion.chunk",
                                created=chunk_created,
                                model=chunk["model"],
                                choices=[
                                    {
                                        "index": 0,
                                        "logprobs": chunk["choices"][0]["logprobs"],
                                        "delta": {
                                            "role": "assistant",
                                            "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
                                        },
                                    }
                                ],
                            )
                    # Check whether the model wants to generate another turn
                    if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
                        if completion_text.endswith("\n<|from|>assistant\n"):
                            cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
                        elif completion_text.endswith("\n<|from|> assistant\n"):
                            cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
                        else:
                            cleaned_completion_text = completion_text.strip()
                        prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                    else:
                        # Yield stop message
                        yield llama_types.CreateChatCompletionStreamResponse(
                            id="chat" + chunk_id,
                            model=chunk["model"],
                            created=chunk_created,
                            object="chat.completion.chunk",
                            choices=[
                                {
                                    "index": 0,
                                    "delta": {},
                                    "logprobs": None,
                                    "finish_reason": "stop",
                                }
                            ],
                        )
                        break
                else:
                    # Check whether the model wants to generate another turn
                    completion_text = ""
                    for chunk in completion:
                        completion_text += chunk["choices"][0]["text"]
                        if len(chunk["choices"][0]["text"].rstrip()) > 0:
                            if tools is not None:
                                func_call_dict = {
                                    "tool_calls": [
                                        {
                                            "index": tool_index,
                                            "id": "call_" + tool_id,
                                            "type": "function",
                                            "function": {
                                                "name": None,
                                                "arguments": chunk["choices"][0]["text"].rstrip(),
                                            },
                                        }
                                    ]
                                }
                            else:
                                func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
                            yield llama_types.CreateChatCompletionStreamResponse(
                                id="chat" + chunk_id,
                                object="chat.completion.chunk",
                                created=chunk_created,
                                model=chunk["model"],
                                choices=[
                                    {
                                        "index": 0,
                                        "logprobs": chunk["choices"][0]["logprobs"],
                                        "delta": {
                                            "role": None,
                                            "content": None,
                                            **func_call_dict,
                                        },
                                    }
                                ],
                            )
                    prompt += completion_text.strip()
                    grammar = None
                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                    completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
                    if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
                        prompt += "\n<|from|>assistant\n<|recipient|>"
                        tool_index += 1
                    else:
                        # Yield tool_call/function_call stop message
                        yield llama_types.CreateChatCompletionStreamResponse(
                            id="chat" + chunk_id,
                            object="chat.completion.chunk",
                            created=chunk_created,
                            model=chunk["model"],
                            choices=[
                                {
                                    "index": 0,
                                    "finish_reason": "tool_calls" if tools is not None else "function_call",
                                    "logprobs": None,
                                    "delta": {
                                        "role": None, "content": None, "function_call": None, "tool_calls": None
                                    },
                                }
                            ],
                        )
                        break

    if stream is not False:
        return generate_streaming(
            tools=tools, functions=functions, function_call=function_call, prompt=prompt
        )
    else:
        if version == "v1":
            # If no or "auto" tool_choice/function_call
            if isinstance(function_call, str) and function_call == "auto":
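
The subtlest part of generate_streaming is the buffered look-ahead used while streaming plain content for the "all" recipient: text is held back while it could still be the start of the "\n<|from|>assistant\n<|recipient|>" stop sequence and is flushed once it diverges. A simplified standalone sketch of that idea (not the exact code above):

# Simplified illustration of the hold-back/flush logic used in generate_streaming.
def stream_until_stop(chunks, stop_sequence="\n<|from|>assistant\n<|recipient|>"):
    buffer = []
    for text in chunks:
        buffer.append(text)
        joined = "".join(buffer)
        if stop_sequence.startswith(joined):
            continue             # could still be the stop marker: hold the text back
        if joined.startswith(stop_sequence):
            return               # full marker seen: stop without emitting it
        while buffer:
            yield buffer.pop(0)  # diverged from the marker: flush what was held back

print(list(stream_until_stop(["Hello", " world", "\n", "<|from|>", "assistant\n", "<|recipient|>"])))
# -> ['Hello', ' world']  (the trailing stop marker is suppressed)
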