Merge https://github.com/abetlen/llama-cpp-python

2024-04-06 16:34:43 +05:30 · 2024-04-06 16:34:43 +05:30 · 0078e0f1cf
commit 0078e0f1cf
parent 8b9cd38c0d 08b16afe11
6 changed files with 41 additions and 24 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.60]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@75cd4c77292034ecec587ecb401366f57338f7c0
+- fix: Always embed metal library by @abetlen in b3bfea6dbfb6ed9ce18f9a2723e0a9e4bd1da7ad
+- fix: missing logprobs in response, incorrect response type for functionary by @abetlen in 1ae3abbcc3af7f4a25a3ffc40b246f18039565e8
+- fix(docs): incorrect tool_choice example by @CISC in #1330
+
 ## [0.2.59]

 - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ if (LLAMA_BUILD)
        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
    endif()

-    if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+    if (APPLE)
        set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
    endif()

--- a/README.md
+++ b/README.md
@@ -458,12 +458,12 @@ The high-level API supports OpenAI compatible function and tool calling. This is
          }
        }
      }],
-      tool_choice=[{
+      tool_choice={
        "type": "function",
        "function": {
          "name": "UserDetail"
        }
-      }]
+      }
 )
 ```

--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.59"
+__version__ = "0.2.60"
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -6,7 +6,7 @@ import ctypes
 import dataclasses
 import random
 import string
-from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast

 import jinja2

@@ -338,6 +338,7 @@ def _convert_completion_to_chat_function(
                            }
                        ],
                    },
+                    "logprobs": None,
                    "finish_reason": "tool_calls",
                }
            ],
@@ -1191,7 +1192,6 @@ def format_mistral_instruct(
        elif (
            message["role"] == "assistant"
            and message["content"] is not None
-            and isinstance(message["content"], str)
        ):
            prompt += " [/INST]" + message["content"] + eos
    prompt += " [/INST]"
@@ -1263,7 +1263,7 @@ def format_gemma(
    **kwargs: Any,
 ) -> ChatFormatterResponse:
    system_message = _get_system_message(messages)
-    if system_message is not None and system_message != "":
+    if system_message != "":
        logger.debug(
            "`role='system'` messages are not allowed on Google's Gemma models."
        )
@@ -1628,6 +1628,7 @@ def functionary_chat_handler(
                        }
                    ],
                },
+                "logprobs": None,
                "finish_reason": "tool_calls",
            }
        ],
@@ -1909,14 +1910,14 @@ def functionary_v1_v2_chat_handler(
        return grammar

    def create_completion(stop):
-        completion: llama_types.Completion = llama.create_completion(
+        completion = cast(llama_types.Completion, llama.create_completion(
            prompt=prompt,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
-            stream=stream,
+            stream=False,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
@@ -1929,7 +1930,7 @@ def functionary_v1_v2_chat_handler(
            model=model,
            logits_processor=logits_processor,
            grammar=grammar,
-        )
+        ))

        return completion

@@ -2050,7 +2051,7 @@ def functionary_v1_v2_chat_handler(
    assert "usage" in completion
    assert len(function_calls) == len(function_bodies)

-    tool_calls = []
+    tool_calls: List[llama_types.ChatCompletionMessageToolCall] = []
    for function_call, function_body in zip(function_calls, function_bodies):
        tool_calls.append(
            {
@@ -2070,6 +2071,12 @@ def functionary_v1_v2_chat_handler(
        )

    # TODO: support stream mode
+    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = { 
+        "function_call": {
+            "name": tool_calls[0]["function"]["name"],
+            "arguments": tool_calls[0]["function"]["arguments"],
+        }
+    } if len(tool_calls) == 1 else {}
    return llama_types.CreateChatCompletionResponse(
        id="chat" + completion["id"],
        object="chat.completion",
@@ -2078,14 +2085,12 @@ def functionary_v1_v2_chat_handler(
        choices=[
            {
                "index": 0,
+                "logprobs": None,
                "message": {
                    "role": "assistant",
                    "content": None if content == "" else content,
-                    "function_call": {
-                        "name": tool_calls[0]["function"]["name"],
-                        "arguments": tool_calls[0]["function"]["arguments"],
-                    } if len(tool_calls) > 0 else None,
-                    "tool_calls": tool_calls if len(tool_calls) > 0 else None,
+                    "tool_calls": tool_calls,
+                    **function_call_dict,
                },
                "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
            }
@@ -2565,8 +2570,8 @@ def chatml_function_calling(
    tool_name = text[len("functions.") :]
    tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
    if not stream:
-        completions = []
-        completions_tool_name = []
+        completions: List[llama_types.CreateCompletionResponse] = []
+        completions_tool_name: List[str] = []
        while tool is not None:
            prompt += f"functions.{tool_name}:\n"
            try:
@@ -2603,6 +2608,7 @@ def chatml_function_calling(
                logits_processor=logits_processor,
                grammar=grammar,
            )
+            completion_or_chunks = cast(llama_types.CreateCompletionResponse, completion_or_chunks)
            completions.append(completion_or_chunks)
            completions_tool_name.append(tool_name)
            prompt += completion_or_chunks["choices"][0]["text"]
@@ -2631,6 +2637,7 @@ def chatml_function_calling(
                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
                ),
            )
+            response = cast(llama_types.CreateCompletionResponse, response)

            tool_name = response["choices"][0]["text"][len("functions.") :]
            tool = next(
@@ -2638,7 +2645,7 @@ def chatml_function_calling(
            )

        # Merge completions
-        function_call = { 
+        function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = { 
            "function_call": {
                "name": tool_name,
                "arguments": completions[0]["choices"][0]["text"],
@@ -2653,6 +2660,7 @@ def chatml_function_calling(
                {
                    "finish_reason": "tool_calls",
                    "index": 0,
+                    "logprobs": None,
                    "message": {
                        "role": "assistant",
                        "content": None,
@@ -2673,20 +2681,22 @@ def chatml_function_calling(
                                zip(completions_tool_name, completions)
                            )
                        ],
-                        **function_call
+                        **function_call_dict
                    },
                }
            ],
            "usage": {
                "completion_tokens": sum(
-                    completion["usage"]["completion_tokens"]
+                    completion["usage"]["completion_tokens"] if "usage" in completion else 0
                    for completion in completions
                ),
                "prompt_tokens": sum(
-                    completion["usage"]["prompt_tokens"] for completion in completions
+                    completion["usage"]["prompt_tokens"] if "usage" in completion else 0
+                    for completion in completions
                ),
                "total_tokens": sum(
-                    completion["usage"]["total_tokens"] for completion in completions
+                    completion["usage"]["total_tokens"] if "usage" in completion else 0
+                    for completion in completions
                ),
            },
        }
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640
+Subproject commit 75cd4c77292034ecec587ecb401366f57338f7c0