Add JSON mode support. Closes #881
parent 4852a6a39c
commit b30b9c338b
4 changed files with 116 additions and 39 deletions
@@ -1901,6 +1901,7 @@ class Llama:
        stream: bool = False,
        stop: Optional[Union[str, List[str]]] = [],
        seed: Optional[int] = None,
+        response_format: Optional[ChatCompletionRequestResponseFormat] = None,
        max_tokens: int = 256,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
@@ -1946,6 +1947,7 @@ class Llama:
            stream=stream,
            stop=stop,
            seed=seed,
+            response_format=response_format,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,

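The two hunks above expose the new `response_format` parameter on `Llama.create_chat_completion` and forward it to the chat handler. A minimal usage sketch (the model path and chat format below are illustrative, not part of this commit):

```python
from llama_cpp import Llama

# Illustrative model path and chat format; any chat-capable GGUF model works.
llm = Llama(model_path="./models/mistral-7b-instruct.Q4_K_M.gguf", chat_format="chatml")

result = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant that answers in JSON."},
        {"role": "user", "content": "Name three primary colors."},
    ],
    response_format={"type": "json_object"},  # new parameter added in this commit
    max_tokens=256,
)
print(result["choices"][0]["message"]["content"])
```
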
@@ -5,8 +5,9 @@ import ctypes
 import dataclasses
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol

-import llama_cpp.llama_types as llama_types
 import llama_cpp.llama as llama
+import llama_cpp.llama_types as llama_types
+import llama_cpp.llama_grammar as llama_grammar


 class LlamaChatCompletionHandler(Protocol):
@@ -25,6 +26,9 @@ class LlamaChatCompletionHandler:
        stream: bool = False,
        stop: Optional[Union[str, List[str]]] = [],
        seed: Optional[int] = None,
+        response_format: Optional[
+            llama_types.ChatCompletionRequestResponseFormat
+        ] = None,
        max_tokens: int = 256,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
@@ -37,7 +41,10 @@ class LlamaChatCompletionHandler(Protocol):
        logits_processor: Optional[llama.LogitsProcessorList] = None,
        grammar: Optional[llama.LlamaGrammar] = None,
        **kwargs,  # type: ignore
-    ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
+    ) -> Union[
+        llama_types.CreateChatCompletionResponse,
+        Iterator[llama_types.CreateChatCompletionStreamResponse],
+    ]:
        ...

@@ -169,6 +176,7 @@ class ChatFormatterResponse:
 class ChatFormatter(Protocol):
    def __call__(
        self,
+        *,
        messages: List[llama_types.ChatCompletionRequestMessage],
        **kwargs: Any,
    ) -> ChatFormatterResponse:
@@ -264,17 +272,24 @@ _CHAT_FORMATS: Dict[str, ChatFormatter] = {}
 def register_chat_format(name: str):
    def decorator(f: ChatFormatter):
        def basic_create_chat_completion(
+            *,
            llama: llama.Llama,
            messages: List[llama_types.ChatCompletionRequestMessage],
            functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
            function_call: Optional[
-                Union[str, llama_types.ChatCompletionFunctionCall]
+                llama_types.ChatCompletionRequestFunctionCall
            ] = None,
+            tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+            tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
            temperature: float = 0.2,
            top_p: float = 0.95,
            top_k: int = 40,
            stream: bool = False,
            stop: Optional[Union[str, List[str]]] = [],
+            seed: Optional[int] = None,
+            response_format: Optional[
+                llama_types.ChatCompletionRequestResponseFormat
+            ] = None,
            max_tokens: int = 256,
            presence_penalty: float = 0.0,
            frequency_penalty: float = 0.0,
@@ -286,8 +301,10 @@ def register_chat_format(name: str):
            model: Optional[str] = None,
            logits_processor: Optional[llama.LogitsProcessorList] = None,
            grammar: Optional[llama.LlamaGrammar] = None,
+            **kwargs,  # type: ignore
        ) -> Union[
-            llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]
+            llama_types.CreateChatCompletionResponse,
+            Iterator[llama_types.CreateChatCompletionStreamResponse],
        ]:
            result = f(
                messages=messages,
@@ -299,6 +316,10 @@ def register_chat_format(name: str):
            stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
            rstop = result.stop if isinstance(result.stop, list) else [result.stop]
            stop = stop + rstop

+            if response_format is not None and response_format["type"] == "json_object":
+                print("hello world")
+                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+
            completion_or_chunks = llama.create_completion(
                prompt=prompt,

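This hunk is the core of JSON mode: when the caller asks for `json_object`, the handler builds a grammar from the bundled `JSON_GBNF` definition and samples under that constraint. A hedged sketch of the same mechanism used directly through `create_completion` (model path and prompt are placeholders):

```python
from llama_cpp import Llama
import llama_cpp.llama_grammar as llama_grammar

# Illustrative model path.
llm = Llama(model_path="./models/mistral-7b-instruct.Q4_K_M.gguf")

# Build the same JSON-only grammar the handler constructs internally and pass it
# straight to create_completion; sampling is then constrained to valid JSON.
json_grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
completion = llm.create_completion(
    prompt="Describe a cat as a JSON object:",
    grammar=json_grammar,
    max_tokens=128,
)
print(completion["choices"][0]["text"])
```
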
@@ -307,6 +328,7 @@ def register_chat_format(name: str):
                top_k=top_k,
                stream=stream,
                stop=stop,
+                seed=seed,
                max_tokens=max_tokens,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
@@ -319,7 +341,7 @@ def register_chat_format(name: str):
                logits_processor=logits_processor,
                grammar=grammar,
            )
-            return _convert_completion_to_chat(completion_or_chunks, stream=stream)  # type: ignore
+            return _convert_completion_to_chat(completion_or_chunks, stream=stream)

        register_chat_completion_handler(name)(basic_create_chat_completion)
        return f
@@ -727,7 +749,7 @@ def functionary_chat_handler(

    assert "usage" in completion
    assert isinstance(function_call, str)
    assert stream is False  # TODO: support stream mode

    return llama_types.CreateChatCompletionResponse(
        id="chat" + completion["id"],
@@ -759,7 +781,9 @@ class Llava15ChatHandler:
        self._llava_cpp = llava_cpp
        self.clip_model_path = clip_model_path

-        self.clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0)
+        self.clip_ctx = self._llava_cpp.clip_model_load(
+            self.clip_model_path.encode(), 0
+        )

    def __del__(self):
        if self.clip_ctx is not None:
@@ -805,12 +829,21 @@ class Llava15ChatHandler:
        logits_processor: Optional[llama.LogitsProcessorList] = None,
        grammar: Optional[llama.LlamaGrammar] = None,
        **kwargs,  # type: ignore
-    ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
-        assert llama.context_params.logits_all is True  # BUG: logits_all=True is required for llava
+    ) -> Union[
+        llama_types.CreateChatCompletionResponse,
+        Iterator[llama_types.CreateChatCompletionStreamResponse],
+    ]:
+        assert (
+            llama.context_params.logits_all is True
+        )  # BUG: logits_all=True is required for llava
        assert self.clip_ctx is not None
        system_prompt = _get_system_message(messages)
-        system_prompt = system_prompt if system_prompt != "" else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+        system_prompt = (
+            system_prompt
+            if system_prompt != ""
+            else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+        )
        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
        user_role = "\nUSER:"
        assistant_role = "\nASSISTANT:"
        llama.reset()
@@ -818,51 +851,86 @@ class Llava15ChatHandler:
        for message in messages:
            if message["role"] == "user" and message["content"] is not None:
                if isinstance(message["content"], str):
-                    llama.eval(llama.tokenize(f"{user_role} {message['content']}".encode("utf8"), add_bos=False))
+                    llama.eval(
+                        llama.tokenize(
+                            f"{user_role} {message['content']}".encode("utf8"),
+                            add_bos=False,
+                        )
+                    )
                else:
                    assert isinstance(message["content"], list)
-                    llama.eval(llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False))
+                    llama.eval(
+                        llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False)
+                    )
                    for content in message["content"]:
                        if content["type"] == "text":
-                            llama.eval(llama.tokenize(f"{content['text']}".encode("utf8"), add_bos=False))
+                            llama.eval(
+                                llama.tokenize(
+                                    f"{content['text']}".encode("utf8"), add_bos=False
+                                )
+                            )
                        if content["type"] == "image_url":
-                            image_bytes = self.load_image(content["image_url"]["url"]) if isinstance(content["image_url"], dict) else self.load_image(content["image_url"])
+                            image_bytes = (
+                                self.load_image(content["image_url"]["url"])
+                                if isinstance(content["image_url"], dict)
+                                else self.load_image(content["image_url"])
+                            )
                            import array
-                            data_array = array.array('B', image_bytes)
-                            c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array)
-                            embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=llama.context_params.n_threads, image_bytes=c_ubyte_ptr, image_bytes_length=len(image_bytes))
+                            data_array = array.array("B", image_bytes)
+                            c_ubyte_ptr = (
+                                ctypes.c_ubyte * len(data_array)
+                            ).from_buffer(data_array)
+                            embed = self._llava_cpp.llava_image_embed_make_with_bytes(
+                                ctx_clip=self.clip_ctx,
+                                n_threads=llama.context_params.n_threads,
+                                image_bytes=c_ubyte_ptr,
+                                image_bytes_length=len(image_bytes),
+                            )
                            # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes)
                            # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes))
                            try:
                                n_past = ctypes.c_int(llama.n_tokens)
                                n_past_p = ctypes.pointer(n_past)
-                                self._llava_cpp.llava_eval_image_embed(ctx_llama=llama.ctx, embed=embed, n_batch=llama.n_batch, n_past=n_past_p)
+                                self._llava_cpp.llava_eval_image_embed(
+                                    ctx_llama=llama.ctx,
+                                    embed=embed,
+                                    n_batch=llama.n_batch,
+                                    n_past=n_past_p,
+                                )
                                assert llama.n_ctx() >= n_past.value
                                llama.n_tokens = n_past.value
                            finally:
                                self._llava_cpp.llava_image_embed_free(embed)
            if message["role"] == "assistant" and message["content"] is not None:
-                llama.eval(llama.tokenize(f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False))
+                llama.eval(
+                    llama.tokenize(
+                        f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False
+                    )
+                )
        llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))

        prompt = llama._input_ids.tolist()

-        return _convert_completion_to_chat(llama.create_completion(
-            prompt=prompt,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            stream=stream,
-            stop=stop,
-            max_tokens=max_tokens,
-            presence_penalty=presence_penalty,
-            frequency_penalty=frequency_penalty,
-            repeat_penalty=repeat_penalty,
-            tfs_z=tfs_z,
-            mirostat_mode=mirostat_mode,
-            mirostat_tau=mirostat_tau,
-            mirostat_eta=mirostat_eta,
-            model=model,
-            logits_processor=logits_processor,
-            grammar=grammar,
-        ), stream=stream)
+        return _convert_completion_to_chat(
+            llama.create_completion(
+                prompt=prompt,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                stream=stream,
+                stop=stop,
+                max_tokens=max_tokens,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            ),
+            stream=stream,
+        )

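For context, this handler is driven through `Llama`'s `chat_handler` hook; a hedged sketch of invoking it with an image message (model and CLIP projector paths and the image URL are placeholders), keeping `logits_all=True` as the assertion above requires:

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Paths below are illustrative placeholders.
handler = Llava15ChatHandler(clip_model_path="./models/mmproj-model-f16.gguf")
llm = Llama(
    model_path="./models/llava-v1.5-7b.Q4_K_M.gguf",
    chat_handler=handler,
    logits_all=True,  # required by the assertion in __call__ above
    n_ctx=2048,
)

resp = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ],
)
print(resp["choices"][0]["message"]["content"])
```
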
@@ -152,6 +152,10 @@ class ChatCompletionFunctionCallOption(TypedDict):
    name: str


+class ChatCompletionRequestResponseFormat(TypedDict):
+    type: Literal["text", "json_object"]
+
+
 class ChatCompletionRequestMessageContentPartText(TypedDict):
    type: Literal["text"]
    text: str

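The new type mirrors OpenAI's `response_format` request field and admits exactly two literal values; a small illustrative sketch:

```python
from llama_cpp.llama_types import ChatCompletionRequestResponseFormat

# "json_object" turns JSON mode on; "text" is the default, unconstrained behaviour.
fmt: ChatCompletionRequestResponseFormat = {"type": "json_object"}
```
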
@@ -241,7 +245,7 @@ ChatCompletionRequestFunctionCall = Union[
    Literal["none", "auto"], ChatCompletionRequestFunctionCallOption
]

ChatCompletionFunctionParameters = Dict[str, JsonType]  # TODO: make this more specific


class ChatCompletionToolFunction(TypedDict):

@@ -792,6 +792,9 @@ class CreateChatCompletionRequest(BaseModel):
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    seed: Optional[int] = Field(None)
+    response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field(
+        default=None,
+    )

    # ignored or currently unsupported
    model: Optional[str] = model_field
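With the server-side field above in place, OpenAI-style HTTP clients can request JSON mode as well. A hedged sketch against a locally running `llama_cpp.server` instance (host, port, and prompt are assumptions):

```python
import requests

payload = {
    "messages": [
        {"role": "user", "content": "Return a JSON object with keys 'name' and 'age'."}
    ],
    "response_format": {"type": "json_object"},  # field added above
    "max_tokens": 200,
}

# Assumes `python -m llama_cpp.server` is listening on localhost:8000.
r = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(r.json()["choices"][0]["message"]["content"])
```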