feat: Implement streaming for Functionary v2 + Bug fixes (#1419)
* set up streaming for v2
* assert v2 streaming, fix tool_call vs function_call
* fix streaming with tool_choice/function_call
* make functions return 1 function call only when 'auto'
* fix

Co-authored-by: Andrei <abetlen@gmail.com>
This commit is contained in:
parent f9b7221c8f
commit 1f56c648c3

1 changed file with 443 additions and 133 deletions
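For context on what the new code path enables, here is a minimal usage sketch, assuming a Functionary v2 GGUF model loaded through llama-cpp-python with its Hugging Face tokenizer; the model path, repo id, and the get_weather tool below are illustrative placeholders, not part of this commit:

from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Illustrative model/tokenizer choices; any Functionary v2 GGUF should behave the same.
llm = Llama(
    model_path="./functionary-small-v2.2.q4_0.gguf",
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
    n_ctx=4096,
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# Before this commit the handler asserted stream is False; now stream=True yields
# chat.completion.chunk objects whose deltas carry tool_calls/function_call fragments.
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the weather in Hanoi?"}],
    tools=tools,
    tool_choice="auto",
    stream=True,
):
    print(chunk["choices"][0]["delta"])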
@@ -1894,6 +1894,8 @@ def functionary_v1_v2_chat_handler(
         function_call = (
             tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
         )
+    elif function_call is not None:
+        pass
     else:
         function_call = "auto"
 
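The hunk above stops an explicitly passed function_call from being overwritten with "auto". For reference, the two shapes the resolution line accepts are the OpenAI-style string and dict forms of tool_choice (values below are illustrative only):

# String form, e.g. "auto" or "none".
tool_choice = "auto"

# Dict form, forcing a specific function; only the nested "function" dict is kept.
tool_choice = {"type": "function", "function": {"name": "get_weather"}}

function_call = tool_choice if isinstance(tool_choice, str) else tool_choice["function"]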
@@ -1930,11 +1932,10 @@ def functionary_v1_v2_chat_handler(
             logits_processor=logits_processor,
             grammar=grammar,
         )
-        completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
+        if stream is False:
+            completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
         return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore
 
-    assert stream is False  # TODO: support stream mode
-
     def get_grammar(function_call):
         function_body = None
         for function in functions or []:
@@ -1968,7 +1969,7 @@ def functionary_v1_v2_chat_handler(
 
         return grammar
 
-    def create_completion(stop):
+    def create_completion(prompt, stop, grammar):
         completion = cast(llama_types.Completion, llama.create_completion(
             prompt=prompt,
             temperature=temperature,
@@ -1976,7 +1977,7 @@ def functionary_v1_v2_chat_handler(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            stream=False,
+            stream=stream,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -1997,6 +1998,315 @@ def functionary_v1_v2_chat_handler(
     function_calls, function_bodies = [], []
     completion_tokens = 0
 
+    def generate_streaming(tools, functions, function_call, prompt):
+        assert version == "v2", "Streaming for v1 is not supported"
+
+        chunk_id, chunk_created = None, None
+
+        # If tool_choice/function_call is provided
+        if isinstance(function_call, dict):
+            prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
+            grammar = get_grammar(function_call["name"])
+            stops = [STOP_TOKEN, FROM_TOKEN]
+            tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+            completion_text = ""
+            first = True
+            for chunk in completion:
+                # Yield the tool/function name first
+                if first:
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": 0,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_call["name"], "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
+                        ],
+                    )
+                    first = False
+                if tools is not None:
+                    func_call_dict = {
+                        "tool_calls": [
+                            {
+                                "index": 0,
+                                "id": "call_" + tool_id,
+                                "type": "function",
+                                "function": {
+                                    "name": None,
+                                    "arguments": chunk["choices"][0]["text"].rstrip(),
+                                },
+                            }
+                        ]
+                    }
+                else:
+                    func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk["id"],
+                        object="chat.completion.chunk",
+                        created=chunk["created"],
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": None,
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
+            # Yield tool_call/function_call stop message
+            yield llama_types.CreateChatCompletionStreamResponse(
+                id="chat" + chunk["id"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                model=chunk["model"],
+                choices=[
+                    {
+                        "index": 0,
+                        "finish_reason": "tool_calls" if tools is not None else "function_call",
+                        "logprobs": None,
+                        "delta": {
+                            "role": None, "content": None, "function_call": None, "tool_calls": None
+                        },
+                    }
+                ],
+            )
+        # If "auto" or no tool_choice/function_call
+        elif isinstance(function_call, str) and function_call == "auto":
+            tool_index = 0
+            while True:
+                # Generate function name first
+                grammar = None
+                stops = CONTENT_TOKEN
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                completion_text = ""
+                for chunk in completion:
+                    completion_text += chunk["choices"][0]["text"]
+                    if chunk_id is None:
+                        chunk_id = chunk["id"]
+                    if chunk_created is None:
+                        chunk_created = chunk["created"]
+                function_name = completion_text.strip()
+                if function_name == "all":
+                    prompt += "all\n<|content|>"
+                    # Yield the first empty message for content
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        model=chunk["model"],
+                        created=chunk_created,
+                        object="chat.completion.chunk",
+                        choices=[
+                            {
+                                "index": 0,
+                                "delta": {"role": "assistant", "content": ""},
+                                "logprobs": None,
+                                "finish_reason": None,
+                            }
+                        ],
+                    )
+                else:
+                    prompt += f"{function_name}\n<|content|>"
+                    grammar = get_grammar(function_name)
+                    tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
+                    if tools is not None:
+                        func_call_dict = {
+                            "tool_calls": [
+                                {
+                                    "index": tool_index,
+                                    "id": "call_" + tool_id,
+                                    "type": "function",
+                                    "function": {"name": function_name, "arguments": ""},
+                                }
+                            ]
+                        }
+                    else:
+                        func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
+                    # Stream function name
+                    yield llama_types.CreateChatCompletionStreamResponse(
+                        id="chat" + chunk_id,
+                        object="chat.completion.chunk",
+                        created=chunk_created,
+                        model=chunk["model"],
+                        choices=[
+                            {
+                                "index": 0,
+                                "logprobs": chunk["choices"][0]["logprobs"],
+                                "delta": {
+                                    "role": "assistant",
+                                    "content": None,
+                                    **func_call_dict,
+                                },
+                            }
+                        ],
+                    )
+                # Generate content
+                stops = [RECIPIENT_TOKEN, STOP_TOKEN]
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                if function_name == "all":
+                    completion_text = ""
+                    stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
+                    for i, chunk in enumerate(completion):
+                        completion_text += chunk["choices"][0]["text"]
+                        if is_end:
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            if stop_sequence.startswith("".join(buffer)):
+                                continue
+                            else:
+                                buffer.pop()
+                                while len(buffer) > 0:
+                                    yield llama_types.CreateChatCompletionStreamResponse(
+                                        id="chat" + chunk_id,
+                                        object="chat.completion.chunk",
+                                        created=chunk_created,
+                                        model=chunk["model"],
+                                        choices=[
+                                            {
+                                                "index": 0,
+                                                "logprobs": chunk["choices"][0]["logprobs"],
+                                                "delta": {
+                                                    "role": "assistant", "content": buffer.pop(0)
+                                                },
+                                            }
+                                        ],
+                                    )
+                                is_end = False
+                        elif chunk["choices"][0]["text"] == "\n":
+                            is_end = True
+                            buffer.append(chunk["choices"][0]["text"].strip(" "))
+                            continue
+
+                        if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": "assistant",
+                                            "content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
+                                        },
+                                    }
+                                ],
+                            )
+                    # Check whether the model wants to generate another turn
+                    if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
+                        if completion_text.endswith("\n<|from|>assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                        elif completion_text.endswith("\n<|from|> assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
+                        else:
+                            cleaned_completion_text = completion_text.strip()
+                        prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
+                    else:
+                        # Yield stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            model=chunk["model"],
+                            created=chunk_created,
+                            object="chat.completion.chunk",
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "delta": {},
+                                    "logprobs": None,
+                                    "finish_reason": "stop",
+                                }
+                            ],
+                        )
+                        break
+                else:
+                    # Check whether the model wants to generate another turn
+                    completion_text = ""
+                    for chunk in completion:
+                        completion_text += chunk["choices"][0]["text"]
+                        if len(chunk["choices"][0]["text"].rstrip()) > 0:
+                            if tools is not None:
+                                func_call_dict = {
+                                    "tool_calls": [
+                                        {
+                                            "index": tool_index,
+                                            "id": "call_" + tool_id,
+                                            "type": "function",
+                                            "function": {
+                                                "name": None,
+                                                "arguments": chunk["choices"][0]["text"].rstrip(),
+                                            },
+                                        }
+                                    ]
+                                }
+                            else:
+                                func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
+                            yield llama_types.CreateChatCompletionStreamResponse(
+                                id="chat" + chunk_id,
+                                object="chat.completion.chunk",
+                                created=chunk_created,
+                                model=chunk["model"],
+                                choices=[
+                                    {
+                                        "index": 0,
+                                        "logprobs": chunk["choices"][0]["logprobs"],
+                                        "delta": {
+                                            "role": None,
+                                            "content": None,
+                                            **func_call_dict,
+                                        },
+                                    }
+                                ],
+                            )
+                    prompt += completion_text.strip()
+                    grammar = None
+                    completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
+                    completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
+                    if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
+                        prompt += "\n<|from|>assistant\n<|recipient|>"
+                        tool_index += 1
+                    else:
+                        # Yield tool_call/function_call stop message
+                        yield llama_types.CreateChatCompletionStreamResponse(
+                            id="chat" + chunk_id,
+                            object="chat.completion.chunk",
+                            created=chunk_created,
+                            model=chunk["model"],
+                            choices=[
+                                {
+                                    "index": 0,
+                                    "finish_reason": "tool_calls" if tools is not None else "function_call",
+                                    "logprobs": None,
+                                    "delta": {
+                                        "role": None, "content": None, "function_call": None, "tool_calls": None
+                                    },
+                                }
+                            ],
+                        )
+                        break
+
+    if stream is not False:
+        return generate_streaming(
+            tools=tools, functions=functions, function_call=function_call, prompt=prompt
+        )
+    else:
-    if version == "v1":
-        # If no or "auto" tool_choice/function_call
-        if isinstance(function_call, str) and function_call == "auto":
+        if version == "v1":
+            # If no or "auto" tool_choice/function_call
+            if isinstance(function_call, str) and function_call == "auto":
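Because generate_streaming emits tool-call arguments incrementally, a caller has to accumulate the deltas per tool_calls index before parsing them. A minimal sketch of that accumulation follows; collect_tool_calls is an illustrative helper, not part of the library:

def collect_tool_calls(chunks):
    """Reassemble streamed tool_calls deltas into {index: (name, arguments)}."""
    calls = {}
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        # Stop chunks set "tool_calls" to None, so fall back to an empty list.
        for tc in delta.get("tool_calls") or []:
            entry = calls.setdefault(tc["index"], {"name": "", "arguments": ""})
            fn = tc.get("function") or {}
            if fn.get("name"):
                entry["name"] = fn["name"]
            entry["arguments"] += fn.get("arguments") or ""
    # Arguments arrive as JSON text constrained by the function grammar; parse as needed.
    return {i: (c["name"], c["arguments"]) for i, c in calls.items()}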