Multimodal Support (Llava 1.5) (#821)
* llava v1.5 integration
* Point llama.cpp to fork
* Add llava shared library target
* Fix type
* Update llama.cpp
* Add llava api
* Revert changes to llama and llama_cpp
* Update llava example
* Add types for new gpt-4-vision-preview api
* Fix typo
* Update llama.cpp
* Update llama_types to match OpenAI v1 API
* Update ChatCompletionFunction type
* Reorder request parameters
* More API type fixes
* Even More Type Updates
* Add parameter for custom chat_handler to Llama class
* Fix circular import
* Convert to absolute imports
* Fix
* Fix pydantic Jsontype bug
* Accept list of prompt tokens in create_completion
* Add llava1.5 chat handler
* Add Multimodal notebook
* Clean up examples
* Add server docs

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
Parent: 56171cf7bf · Commit: aab74f0b2b
10 changed files with 796 additions and 102 deletions
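Taken together, these changes wire a CLIP-based image encoder (llava.cpp's shared library) into the existing chat-completion path. As orientation before the per-file diffs, here is a minimal sketch of how the new pieces fit together from the Python API. The file paths are placeholders; the sketch assumes the `Llava15ChatHandler` and the `chat_handler` parameter added by this commit, and a context created with `logits_all=True`, which the handler asserts.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Hypothetical local paths to a LLaVA 1.5 GGUF model and its CLIP/mmproj model.
chat_handler = Llava15ChatHandler(clip_model_path="./mmproj-model-f16.gguf")
llm = Llama(
    model_path="./llava-v1.5-7b.Q4_K.gguf",
    chat_handler=chat_handler,  # new Llama() parameter added by this commit
    logits_all=True,            # the llava handler asserts logits_all is enabled
    n_ctx=2048,
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "What does the image say?"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```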
CMakeLists.txt

@@ -41,4 +41,23 @@ if (LLAMA_BUILD)
         FILES $<TARGET_RUNTIME_DLLS:llama>
         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+    add_subdirectory(vendor/llama.cpp/examples/llava)
+    set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
+    install(
+        TARGETS llava_shared
+        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+    )
+    # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
+    install(
+        TARGETS llava_shared
+        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+    )
 endif()
docs/server.md (new file, 77 lines)

# OpenAI Compatible Server

`llama-cpp-python` offers an OpenAI API compatible web server.

This web server can be used to serve local models and easily connect them to existing clients.

## Setup

### Installation

The server can be installed by running the following command:

```bash
pip install llama-cpp-python[server]
```

### Running the server

The server can then be started by running the following command:

```bash
python3 -m llama_cpp.server --model <model_path>
```

### Server options

For a full list of options, run:

```bash
python3 -m llama_cpp.server --help
```

NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.

## Guides

### Multi-modal Models

`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to read information from both text and images.

You'll first need to download one of the available multi-modal models in GGUF format:

- [llava1.5 7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava1.5 13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)

Then when you run the server you'll need to also specify the path to the clip model used for image embedding:

```bash
python3 -m llama_cpp.server --model <model_path> --clip-model-path <clip_model_path>
```

Then you can just use the OpenAI API as normal:

```python3
from openai import OpenAI

client = OpenAI(base_url="http://<host>:<port>/v1", api_key="sk-xxx")
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "<image_url>"
                    },
                },
                {"type": "text", "text": "What does the image say"},
            ],
        }
    ],
)
print(response)
```
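As a quick illustration of the NOTE above about environment variables (this snippet is not part of the committed docs; the variable names are assumed from the flag-to-uppercase convention and from the new `clip_model_path` and `chat_format` settings in the server diff below, and the model file names are placeholders), the multimodal server could also be launched like this:

```bash
MODEL=./models/llava-v1.5-7b.Q4_K.gguf \
CLIP_MODEL_PATH=./models/mmproj-model-f16.gguf \
CHAT_FORMAT=llava-1-5 \
python3 -m llama_cpp.server
```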
examples/notebooks/Multimodal.ipynb (new file, 84 lines)

The notebook contains one code cell and one markdown cell (Python 3 kernel, nbformat 4). Code cell source:

```python
from openai import OpenAI

import urllib.request
import base64

def get_data_url(url):
    return "data:image/png;base64," + base64.b64encode(urllib.request.urlopen(url).read()).decode("utf-8")

client = OpenAI(base_url="http://100.64.159.73:8000/v1", api_key="sk-1234")
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": get_data_url("https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png"),
                        # "url": "https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png",
                    },
                },
                {"type": "text", "text": "What does the image say"},
            ],
        }
    ],
)
print(response)
```

Recorded output:

```
ChatCompletion(id='chatcmpl-65a710ba-41d1-4d0a-a124-a44b2b4a0189', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=' The image reads "LlamaC++."', role='assistant', function_call=None, tool_calls=None))], created=1699413274, model='gpt-4-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=624, total_tokens=634))
```

Markdown cell:

![](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
llama_cpp/llama.py

@@ -21,9 +21,9 @@ from collections import deque, OrderedDict
 import diskcache
 import ctypes
 
-from . import llama_cpp
 from .llama_types import *
 from .llama_grammar import LlamaGrammar
+import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
 
 import numpy as np
@@ -752,6 +752,7 @@ class Llama:
         numa: bool = False,
         # Chat Format Params
         chat_format: str = "llama-2",
+        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
         # Misc
         verbose: bool = True,
         # Extra Params
@@ -784,6 +785,7 @@ class Llama:
             lora_path: Path to a LoRA file to apply to the model.
             numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             chat_format: String specifying the chat format to use when calling create_chat_completion.
+            chat_handler: Optional chat handler to use when calling create_chat_completion.
             verbose: Print verbose output to stderr.
 
         Raises:
@@ -910,6 +912,7 @@ class Llama:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
 
         self.chat_format = chat_format
+        self.chat_handler = chat_handler
 
         self._n_vocab = self.n_vocab()
         self._n_ctx = self.n_ctx()
@@ -1231,7 +1234,7 @@ class Llama:
         else:
             inputs = input
 
-        data: List[EmbeddingData] = []
+        data: List[Embedding] = []
         total_tokens = 0
         for index, input in enumerate(inputs):
             tokens = self.tokenize(input.encode("utf-8"), special=True)
@@ -1276,7 +1279,7 @@ class Llama:
 
     def _create_completion(
         self,
-        prompt: str,
+        prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: int = 16,
         temperature: float = 0.8,
@@ -1297,7 +1300,9 @@ class Llama:
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
+    ) -> Union[
+        Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
+    ]:
         assert self._ctx is not None
         assert suffix is None or suffix.__class__ is str
 
@@ -1309,7 +1314,7 @@ class Llama:
             self.tokenize(prompt.encode("utf-8"), special=True)
             if prompt != ""
             else [self.token_bos()]
-        )
+        ) if isinstance(prompt, str) else prompt
         text: bytes = b""
         returned_tokens: int = 0
         stop = (
@@ -1322,7 +1327,7 @@ class Llama:
 
         if len(prompt_tokens) >= self._n_ctx:
             raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}"
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
         if max_tokens <= 0:
@@ -1732,7 +1737,7 @@ class Llama:
 
     def create_completion(
         self,
-        prompt: str,
+        prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: int = 128,
         temperature: float = 0.8,
@@ -1753,7 +1758,7 @@ class Llama:
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
         Args:
@@ -1800,7 +1805,7 @@ class Llama:
             grammar=grammar,
         )
         if stream:
-            chunks: Iterator[CompletionChunk] = completion_or_chunks
+            chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
             return chunks
         completion: Completion = next(completion_or_chunks)  # type: ignore
         return completion
@@ -1828,7 +1833,7 @@ class Llama:
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.
 
         Args:
@@ -1879,7 +1884,9 @@ class Llama:
         self,
         messages: List[ChatCompletionRequestMessage],
         functions: Optional[List[ChatCompletionFunction]] = None,
-        function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None,
+        function_call: Optional[ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
+        tool_choice: Optional[ChatCompletionToolChoiceOption] = None,
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
@@ -1896,7 +1903,9 @@ class Llama:
         model: Optional[str] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+    ) -> Union[
+        CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
+    ]:
         """Generate a chat completion from a list of messages.
 
         Args:
@@ -1912,12 +1921,16 @@ class Llama:
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        handler = llama_chat_format.get_chat_completion_handler(self.chat_format)
+        handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+            self.chat_format
+        )
         return handler(
-            self,
+            llama=self,
             messages=messages,
             functions=functions,
             function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
             temperature=temperature,
             top_p=top_p,
             top_k=top_k,
@@ -1974,6 +1987,7 @@ class Llama:
             numa=self.numa,
             # Chat Format Params
             chat_format=self.chat_format,
+            chat_handler=self.chat_handler,
             # Misc
             verbose=self.verbose,
         )
@@ -2015,6 +2029,7 @@ class Llama:
             numa=state["numa"],
             # Chat Format Params
             chat_format=state["chat_format"],
+            chat_handler=state["chat_handler"],
             # Misc
             verbose=state["verbose"],
         )
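One of the behavioural changes above is that `create_completion` (and `_create_completion`) now accepts a list of token ids in place of a string prompt, which is what lets the llava handler splice image embeddings into the context and then continue generation from the already-evaluated tokens. A minimal sketch of the new call form (model path is a placeholder):

```python
from llama_cpp import Llama

llm = Llama(model_path="./model.gguf")

# Pre-tokenize the prompt ourselves and pass the token ids directly.
tokens = llm.tokenize(b"Q: Name the planets in the solar system. A: ", special=True)
out = llm.create_completion(prompt=tokens, max_tokens=32, stop=["Q:"])
print(out["choices"][0]["text"])
```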
llama_cpp/llama_chat_format.py

@@ -1,22 +1,24 @@
 from __future__ import annotations
 
 import os
+import ctypes
 import dataclasses
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol
 
-from . import llama_types
-from . import llama
+import llama_cpp.llama_types as llama_types
+import llama_cpp.llama as llama
 
 
 class LlamaChatCompletionHandler(Protocol):
     def __call__(
         self,
+        *,
         llama: llama.Llama,
         messages: List[llama_types.ChatCompletionRequestMessage],
         functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
-        function_call: Optional[
-            Union[str, llama_types.ChatCompletionFunctionCall]
-        ] = None,
+        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
         temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,
@@ -33,7 +35,8 @@ class LlamaChatCompletionHandler(Protocol):
         model: Optional[str] = None,
         logits_processor: Optional[llama.LogitsProcessorList] = None,
         grammar: Optional[llama.LlamaGrammar] = None,
-    ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
+        **kwargs,  # type: ignore
+    ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
         ...
 
 
@@ -199,7 +202,7 @@ def _convert_text_completion_to_chat(
 
 
 def _convert_text_completion_chunks_to_chat(
-    chunks: Iterator[llama_types.CompletionChunk],
+    chunks: Iterator[llama_types.CreateCompletionStreamResponse],
 ) -> Iterator[llama_types.ChatCompletionChunk]:
     for i, chunk in enumerate(chunks):
         if i == 0:
@@ -239,12 +242,15 @@ def _convert_text_completion_chunks_to_chat(
 
 def _convert_completion_to_chat(
     completion_or_chunks: Union[
-        llama_types.Completion, Iterator[llama_types.CompletionChunk]
+        llama_types.CreateCompletionResponse,
+        Iterator[llama_types.CreateCompletionStreamResponse],
     ],
     stream: bool = False,
-) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
+) -> Union[
+    llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]
+]:
     if stream:
-        chunks: Iterator[llama_types.CompletionChunk] = completion_or_chunks  # type: ignore
+        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
         return _convert_text_completion_chunks_to_chat(chunks)
     else:
         completion: llama_types.Completion = completion_or_chunks  # type: ignore
@@ -329,7 +335,9 @@ def get_chat_format(name: str):
     )
 
 
-def hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path: Union[str, os.PathLike[str]]) -> ChatFormatter:
+def hf_autotokenizer_to_chat_formatter(
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+) -> ChatFormatter:
     # https://huggingface.co/docs/transformers/main/chat_templating
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
@@ -538,7 +546,7 @@ def functionary_chat_handler(
     llama: llama.Llama,
     messages: List[llama_types.ChatCompletionRequestMessage],
     functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
-    function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None,
+    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
     temperature: float = 0.2,
     top_p: float = 0.95,
     top_k: int = 40,
@@ -555,6 +563,7 @@ def functionary_chat_handler(
     model: Optional[str] = None,
     logits_processor: Optional[llama.LogitsProcessorList] = None,
     grammar: Optional[llama.LlamaGrammar] = None,
+    **kwargs,  # type: ignore
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
 
@@ -613,13 +622,13 @@ def functionary_chat_handler(
     all_messages: List[llama_types.ChatCompletionRequestMessage] = []
     if functions is not None:
         all_messages.append(
-            llama_types.ChatCompletionRequestMessage(
+            llama_types.ChatCompletionRequestSystemMessage(
                 role="system", content=generate_schema_from_functions(functions)
             )
         )
 
     all_messages.append(
-        llama_types.ChatCompletionRequestMessage(
+        llama_types.ChatCompletionRequestSystemMessage(
             role="system", content=SYSTEM_MESSAGE
         )
     )
@@ -636,7 +645,9 @@ def functionary_chat_handler(
         all_messages.append(message)
 
     all_messages.append(
-        llama_types.ChatCompletionRequestMessage(role="assistant", content=None)
+        llama_types.ChatCompletionRequestAssistantMessage(
+            role="assistant", content=None
+        )
     )
 
     def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
@@ -713,6 +724,10 @@ def functionary_chat_handler(
             prompt=new_prompt, stop=["user:", "</s>"], stream=False
         )  # type: ignore
 
+        assert "usage" in completion
+        assert isinstance(function_call, str)
+        assert stream is False  # TODO: support stream mode
+
         return llama_types.CreateChatCompletionResponse(
             id="chat" + completion["id"],
             object="chat.completion",
@@ -734,3 +749,119 @@ def functionary_chat_handler(
         ],
         usage=completion["usage"],
     )
+
+
+class Llava15ChatHandler:
+    def __init__(self, clip_model_path: str):
+        import llama_cpp.llava_cpp as llava_cpp
+
+        self._llava_cpp = llava_cpp
+        self.clip_model_path = clip_model_path
+
+        self.clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0)
+
+    def __del__(self):
+        if self.clip_ctx is not None:
+            self._llava_cpp.clip_free(self.clip_ctx)
+            self.clip_ctx = None
+
+    def load_image(self, image_url: str) -> bytes:
+        if image_url.startswith("data:"):
+            import base64
+
+            image_bytes = base64.b64decode(image_url.split(",")[1])
+            return image_bytes
+        else:
+            import urllib.request
+
+            with urllib.request.urlopen(image_url) as f:
+                image_bytes = f.read()
+                return image_bytes
+
+    def __call__(
+        self,
+        *,
+        llama: llama.Llama,
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+        function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+        temperature: float = 0.2,
+        top_p: float = 0.95,
+        top_k: int = 40,
+        stream: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        max_tokens: int = 256,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        repeat_penalty: float = 1.1,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        logits_processor: Optional[llama.LogitsProcessorList] = None,
+        grammar: Optional[llama.LlamaGrammar] = None,
+        **kwargs,  # type: ignore
+    ) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
+        assert llama.context_params.logits_all is True  # BUG: logits_all=True is required for llava
+        assert self.clip_ctx is not None
+        system_prompt = _get_system_message(messages)
+        system_prompt = system_prompt if system_prompt != "" else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+        user_role = "\nUSER:"
+        assistant_role = "\nASSISTANT:"
+        llama.reset()
+        llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True))
+        for message in messages:
+            if message["role"] == "user" and message["content"] is not None:
+                if isinstance(message["content"], str):
+                    llama.eval(llama.tokenize(f"{user_role} {message['content']}".encode("utf8"), add_bos=False))
+                else:
+                    assert isinstance(message["content"], list)
+                    llama.eval(llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False))
+                    for content in message["content"]:
+                        if content["type"] == "text":
+                            llama.eval(llama.tokenize(f"{content['text']}".encode("utf8"), add_bos=False))
+                        if content["type"] == "image_url":
+                            image_bytes = self.load_image(content["image_url"]["url"]) if isinstance(content["image_url"], dict) else self.load_image(content["image_url"])
+                            import array
+                            data_array = array.array('B', image_bytes)
+                            c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array)
+                            embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=llama.context_params.n_threads, image_bytes=c_ubyte_ptr, image_bytes_length=len(image_bytes))
+                            # image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes)
+                            # embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes))
+                            try:
+                                n_past = ctypes.c_int(llama.n_tokens)
+                                n_past_p = ctypes.pointer(n_past)
+                                self._llava_cpp.llava_eval_image_embed(ctx_llama=llama.ctx, embed=embed, n_batch=llama.n_batch, n_past=n_past_p)
+                                assert llama.n_ctx() >= n_past.value
+                                llama.n_tokens = n_past.value
+                            finally:
+                                self._llava_cpp.llava_image_embed_free(embed)
+            if message["role"] == "assistant" and message["content"] is not None:
+                llama.eval(llama.tokenize(f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False))
+        llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))
+
+        prompt = llama._input_ids.tolist()
+
+        return _convert_completion_to_chat(llama.create_completion(
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            stream=stream,
+            stop=stop,
+            max_tokens=max_tokens,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            repeat_penalty=repeat_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        ), stream=stream)
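Because `LlamaChatCompletionHandler` is a keyword-only Protocol and `Llama()` now accepts any matching callable via `chat_handler`, a custom handler does not have to be registered by name. The following is a purely illustrative sketch (not part of the commit) of a trivial handler that flattens messages into a plain prompt and reuses the module's private `_convert_completion_to_chat` helper, as the built-in handlers do:

```python
from typing import Iterator, List, Union

import llama_cpp
import llama_cpp.llama_types as llama_types
from llama_cpp.llama_chat_format import _convert_completion_to_chat


def plain_chat_handler(
    *,
    llama: llama_cpp.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    stream: bool = False,
    **kwargs,  # other sampling/tool parameters are accepted and ignored in this sketch
) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
    # Flatten the chat into a single prompt string (illustrative, not a real template).
    parts = []
    for m in messages:
        content = m.get("content")
        parts.append(f"{m['role']}: {content if isinstance(content, str) else ''}")
    prompt = "\n".join(parts) + "\nassistant:"
    return _convert_completion_to_chat(
        llama.create_completion(prompt=prompt, stream=stream), stream=stream
    )


# Hypothetical usage:
# llm = llama_cpp.Llama(model_path="./model.gguf", chat_handler=plain_chat_handler)
```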
llama_cpp/llama_grammar.py

@@ -19,7 +19,7 @@ from typing import (
     overload,
 )
 
-from . import llama_cpp
+import llama_cpp.llama_cpp as llama_cpp
 
 # Type aliases
 llama_grammar_element = llama_cpp.llama_grammar_element
llama_cpp/llama_types.py

@@ -1,4 +1,6 @@
-"""Types and request signatrues for OpenAI compatibility
+"""Types and request signatures for OpenAI compatibility
 
+NOTE: These types may change to match the OpenAI OpenAPI specification.
+
 Based on the OpenAI OpenAPI specification:
 https://github.com/openai/openai-openapi/blob/master/openapi.yaml
@@ -8,6 +10,12 @@ from typing import Any, List, Optional, Dict, Union
 from typing_extensions import TypedDict, NotRequired, Literal
 
 
+# NOTE: Defining this correctly using annotations seems to break pydantic validation.
+# This is a workaround until we can figure out how to do this correctly
+# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
+JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]]
+
+
 class EmbeddingUsage(TypedDict):
     prompt_tokens: int
     total_tokens: int
@@ -19,9 +27,6 @@ class Embedding(TypedDict):
     embedding: List[float]
 
 
-EmbeddingData = Embedding
-
-
 class CreateEmbeddingResponse(TypedDict):
     object: Literal["list"]
     model: str
@@ -49,110 +54,92 @@ class CompletionUsage(TypedDict):
     total_tokens: int
 
 
-class CreateCompletionStreamResponse(TypedDict):
-    id: str
-    object: Literal["text_completion"]
-    created: int
-    model: str
-    choices: List[CompletionChoice]
-
-
-CompletionChunk = CreateCompletionStreamResponse
-
-
 class CreateCompletionResponse(TypedDict):
     id: str
     object: Literal["text_completion"]
     created: int
     model: str
     choices: List[CompletionChoice]
-    usage: CompletionUsage
+    usage: NotRequired[CompletionUsage]
 
 
-Completion = CreateCompletionResponse
-
-
-class ChatCompletionFunctionCall(TypedDict):
+class ChatCompletionResponseFunctionCall(TypedDict):
     name: str
     arguments: str
 
 
 class ChatCompletionResponseMessage(TypedDict):
-    role: Literal["assistant", "user", "system", "function"]
     content: Optional[str]
-    user: NotRequired[str]
-    function_call: NotRequired[ChatCompletionFunctionCall]
+    tool_calls: NotRequired["ChatCompletionMessageToolCalls"]
+    role: Literal["assistant", "function"]  # NOTE: "function" may be incorrect here
+    function_call: NotRequired[ChatCompletionResponseFunctionCall]  # DEPRECATED
 
 
-ChatCompletionMessage = ChatCompletionResponseMessage
-
-
-class ChatCompletionResponseFunction(TypedDict):
+class ChatCompletionFunction(TypedDict):
     name: str
     description: NotRequired[str]
-    parameters: Dict[str, Any]  # TODO: make this more specific
-
-
-ChatCompletionFunction = ChatCompletionResponseFunction
+    parameters: Dict[str, JsonType]  # TODO: make this more specific
 
 
 class ChatCompletionResponseChoice(TypedDict):
     index: int
-    message: ChatCompletionMessage
+    message: "ChatCompletionResponseMessage"
     finish_reason: Optional[str]
 
 
-ChatCompletionChoice = ChatCompletionResponseChoice
-
-
 class CreateChatCompletionResponse(TypedDict):
     id: str
     object: Literal["chat.completion"]
     created: int
     model: str
-    choices: List[ChatCompletionChoice]
+    choices: List["ChatCompletionResponseChoice"]
     usage: CompletionUsage
 
 
-ChatCompletion = CreateChatCompletionResponse
+class ChatCompletionMessageToolCallChunkFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ChatCompletionMessageToolCallChunk(TypedDict):
+    index: int
+    id: NotRequired[str]
+    type: Literal["function"]
+    function: ChatCompletionMessageToolCallChunkFunction
 
 
 class ChatCompletionStreamResponseDeltaEmpty(TypedDict):
     pass
 
 
-ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
+class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
+    name: str
+    arguments: str
 
 
 class ChatCompletionStreamResponseDelta(TypedDict):
-    role: NotRequired[Literal["assistant"]]
     content: NotRequired[str]
-    function_call: NotRequired[ChatCompletionFunctionCall]
-
-
-ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
+    function_call: NotRequired[
+        ChatCompletionStreamResponseDeltaFunctionCall
+    ]  # DEPRECATED
+    tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
+    role: NotRequired[Literal["system", "user", "assistant", "tool"]]
 
 
 class ChatCompletionStreamResponseChoice(TypedDict):
     index: int
-    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
+    delta: Union[
+        ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
+    ]
     finish_reason: Optional[Literal["stop", "length", "function_call"]]
 
 
-ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
-
-
-class ChatCompletionStreamResponse(TypedDict):
+class CreateChatCompletionStreamResponse(TypedDict):
     id: str
     model: str
     object: Literal["chat.completion.chunk"]
     created: int
-    choices: List[ChatCompletionChunkChoice]
-
-
-ChatCompletionChunk = ChatCompletionStreamResponse
-
-
-JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
+    choices: List[ChatCompletionStreamResponseChoice]
 
 
 class ChatCompletionFunctions(TypedDict):
@@ -165,8 +152,137 @@ class ChatCompletionFunctionCallOption(TypedDict):
     name: str
 
 
-class ChatCompletionRequestMessage(TypedDict):
-    role: Literal["assistant", "user", "system", "function"]
-    content: Optional[str]
-    name: NotRequired[str]
-    function_call: NotRequired[ChatCompletionFunctionCall]
+class ChatCompletionRequestMessageContentPartText(TypedDict):
+    type: Literal["text"]
+    text: str
+
+
+class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict):
+    url: str
+    detail: NotRequired[Literal["auto", "low", "high"]]
+
+
+class ChatCompletionRequestMessageContentPartImage(TypedDict):
+    type: Literal["image_url"]
+    image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl]
+
+
+ChatCompletionRequestMessageContentPart = Union[
+    ChatCompletionRequestMessageContentPartText,
+    ChatCompletionRequestMessageContentPartImage,
+]
+
+
+class ChatCompletionRequestSystemMessage(TypedDict):
+    role: Literal["system"]
+    content: Optional[str]
+
+
+class ChatCompletionRequestUserMessage(TypedDict):
+    role: Literal["user"]
+    content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]]
+
+
+class ChatCompletionMessageToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ChatCompletionMessageToolCall(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ChatCompletionMessageToolCallFunction
+
+
+ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall]
+
+
+class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict):
+    name: str
+    arguments: str
+
+
+class ChatCompletionRequestAssistantMessage(TypedDict):
+    role: Literal["assistant"]
+    content: Optional[str]
+    tool_calls: NotRequired[ChatCompletionMessageToolCalls]
+    function_call: NotRequired[
+        ChatCompletionRequestAssistantMessageFunctionCall
+    ]  # DEPRECATED
+
+
+class ChatCompletionRequestToolMessage(TypedDict):
+    role: Literal["tool"]
+    content: Optional[str]
+    tool_call_id: str
+
+
+class ChatCompletionRequestFunctionMessage(TypedDict):
+    role: Literal["function"]
+    content: Optional[str]
+    name: str
+
+
+ChatCompletionRequestMessage = Union[
+    ChatCompletionRequestSystemMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestAssistantMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestToolMessage,
+    ChatCompletionRequestFunctionMessage,
+]
+
+
+class ChatCompletionRequestFunctionCallOption(TypedDict):
+    name: str
+
+
+ChatCompletionRequestFunctionCall = Union[
+    Literal["none", "auto"], ChatCompletionRequestFunctionCallOption
+]
+
+ChatCompletionFunctionParameters = Dict[str, JsonType]  # TODO: make this more specific
+
+
+class ChatCompletionToolFunction(TypedDict):
+    name: str
+    description: NotRequired[str]
+    parameters: ChatCompletionFunctionParameters
+
+
+class ChatCompletionTool(TypedDict):
+    type: Literal["function"]
+    function: ChatCompletionToolFunction
+
+
+class ChatCompletionNamedToolChoiceFunction(TypedDict):
+    name: str
+
+
+class ChatCompletionNamedToolChoice(TypedDict):
+    type: Literal["function"]
+    function: ChatCompletionNamedToolChoiceFunction
+
+
+ChatCompletionToolChoiceOption = Union[
+    Literal["none", "auto"], ChatCompletionNamedToolChoice
+]
+
+
+# NOTE: The following type names are not part of the OpenAI OpenAPI specification
+# and will be removed in a future major release.
+
+EmbeddingData = Embedding
+CompletionChunk = CreateCompletionResponse
+Completion = CreateCompletionResponse
+CreateCompletionStreamResponse = CreateCompletionResponse
+ChatCompletionMessage = ChatCompletionResponseMessage
+ChatCompletionChoice = ChatCompletionResponseChoice
+ChatCompletion = CreateChatCompletionResponse
+ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
+ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
+ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
+ChatCompletionChunk = CreateChatCompletionStreamResponse
+ChatCompletionStreamResponse = CreateChatCompletionStreamResponse
+ChatCompletionResponseFunction = ChatCompletionFunction
+ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall
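The new request-message TypedDicts compose exactly like the OpenAI vision payloads used in the docs and notebook above. A small, purely illustrative sketch (TypedDicts are plain dicts at runtime; the URL is a placeholder):

```python
from typing import List

import llama_cpp.llama_types as llama_types

image_part: llama_types.ChatCompletionRequestMessageContentPartImage = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.png", "detail": "auto"},
}
text_part: llama_types.ChatCompletionRequestMessageContentPartText = {
    "type": "text",
    "text": "What does the image say?",
}
user_message: llama_types.ChatCompletionRequestUserMessage = {
    "role": "user",
    "content": [image_part, text_part],
}
messages: List[llama_types.ChatCompletionRequestMessage] = [user_message]
```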
232
llama_cpp/llava_cpp.py
Normal file
232
llama_cpp/llava_cpp.py
Normal file
|
@ -0,0 +1,232 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import ctypes
|
||||||
|
from ctypes import (
|
||||||
|
c_bool,
|
||||||
|
c_char_p,
|
||||||
|
c_int,
|
||||||
|
c_int8,
|
||||||
|
c_int32,
|
||||||
|
c_uint8,
|
||||||
|
c_uint32,
|
||||||
|
c_size_t,
|
||||||
|
c_float,
|
||||||
|
c_double,
|
||||||
|
c_void_p,
|
||||||
|
POINTER,
|
||||||
|
_Pointer, # type: ignore
|
||||||
|
Structure,
|
||||||
|
Array,
|
||||||
|
)
|
||||||
|
import pathlib
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
import llama_cpp.llama_cpp as llama_cpp
|
||||||
|
|
||||||
|
# Load the library
|
||||||
|
def _load_shared_library(lib_base_name: str):
|
||||||
|
# Construct the paths to the possible shared library names
|
||||||
|
_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
|
||||||
|
# Searching for the library in the current directory under the name "libllama" (default name
|
||||||
|
# for llamacpp) and "llama" (default name for this repo)
|
||||||
|
_lib_paths: List[pathlib.Path] = []
|
||||||
|
# Determine the file extension based on the platform
|
||||||
|
if sys.platform.startswith("linux"):
|
||||||
|
_lib_paths += [
|
||||||
|
_base_path / f"lib{lib_base_name}.so",
|
||||||
|
]
|
||||||
|
elif sys.platform == "darwin":
|
||||||
|
_lib_paths += [
|
||||||
|
_base_path / f"lib{lib_base_name}.so",
|
||||||
|
_base_path / f"lib{lib_base_name}.dylib",
|
||||||
|
]
|
||||||
|
elif sys.platform == "win32":
|
||||||
|
_lib_paths += [
|
||||||
|
_base_path / f"{lib_base_name}.dll",
|
||||||
|
_base_path / f"lib{lib_base_name}.dll",
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
raise RuntimeError("Unsupported platform")
|
||||||
|
|
||||||
|
if "LLAMA_CPP_LIB" in os.environ:
|
||||||
|
lib_base_name = os.environ["LLAMA_CPP_LIB"]
|
||||||
|
_lib = pathlib.Path(lib_base_name)
|
||||||
|
_base_path = _lib.parent.resolve()
|
||||||
|
_lib_paths = [_lib.resolve()]
|
||||||
|
|
||||||
|
cdll_args = dict() # type: ignore
|
||||||
|
# Add the library directory to the DLL search path on Windows (if needed)
|
||||||
|
if sys.platform == "win32" and sys.version_info >= (3, 8):
|
||||||
|
os.add_dll_directory(str(_base_path))
|
||||||
|
if "CUDA_PATH" in os.environ:
|
||||||
|
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
|
||||||
|
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
|
||||||
|
cdll_args["winmode"] = ctypes.RTLD_GLOBAL
|
||||||
|
|
||||||
|
# Try to load the shared library, handling potential errors
|
||||||
|
for _lib_path in _lib_paths:
|
||||||
|
if _lib_path.exists():
|
||||||
|
try:
|
||||||
|
return ctypes.CDLL(str(_lib_path), **cdll_args)
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
|
||||||
|
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Shared library with base name '{lib_base_name}' not found"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Specify the base name of the shared library to load
|
||||||
|
_libllava_base_name = "llava"
|
||||||
|
|
||||||
|
# Load the library
|
||||||
|
_libllava = _load_shared_library(_libllava_base_name)
|
||||||
|
|
||||||
|
|
||||||
|
################################################
|
||||||
|
# llava.h
|
||||||
|
################################################
|
||||||
|
|
||||||
|
# struct clip_ctx;
|
||||||
|
clip_ctx_p = c_void_p
|
||||||
|
|
||||||
|
# struct llava_image_embed {
|
||||||
|
# float * embed;
|
||||||
|
# int n_image_pos;
|
||||||
|
# };
|
||||||
|
class llava_image_embed(Structure):
|
||||||
|
_fields_ = [
|
||||||
|
("embed", POINTER(c_float)),
|
||||||
|
("n_image_pos", c_int),
|
||||||
|
]
|
||||||
|
|
||||||
|
# /** sanity check for clip <-> llava embed size match */
|
||||||
|
# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||||
|
def llava_validate_embed_size(ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p) -> bool:
|
||||||
|
return _libllava.llava_validate_embed_size(ctx_llama, ctx_clip)
|
||||||
|
|
||||||
|
_libllava.llava_validate_embed_size.argtypes = [llama_cpp.llama_context_p, clip_ctx_p]
|
||||||
|
_libllava.llava_validate_embed_size.restype = c_bool
|
||||||
|
|
||||||
|
# /** build an image embed from image file bytes */
|
||||||
|
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||||
|
def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int]) -> "_Pointer[llava_image_embed]":
|
||||||
|
return _libllava.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
|
||||||
|
|
||||||
|
_libllava.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, POINTER(c_uint8), c_int]
|
||||||
|
_libllava.llava_image_embed_make_with_bytes.restype = POINTER(llava_image_embed)
|
||||||
|
|
||||||
|
# /** build an image embed from a path to an image filename */
|
||||||
|
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
|
def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes) -> "_Pointer[llava_image_embed]":
|
||||||
|
return _libllava.llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path)
|
||||||
|
|
||||||
|
_libllava.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
|
||||||
|
_libllava.llava_image_embed_make_with_filename.restype = POINTER(llava_image_embed)
|
||||||
|
|
||||||
|
# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||||
|
# /** free an embedding made with llava_image_embed_make_* */
|
||||||
|
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"):
|
||||||
|
return _libllava.llava_image_embed_free(embed)
|
||||||
|
|
||||||
|
_libllava.llava_image_embed_free.argtypes = [POINTER(llava_image_embed)]
|
||||||
|
_libllava.llava_image_embed_free.restype = None
|
||||||
|
|
||||||
|
# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
||||||
|
# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
||||||
|
def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]") -> bool:
|
||||||
|
return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past)
|
||||||
|
|
||||||
|
_libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)]
|
||||||
|
_libllava.llava_eval_image_embed.restype = c_bool
|
||||||
|
|
||||||
|
|
||||||
|
################################################
|
||||||
|
# clip.h
|
||||||
|
################################################
|
||||||
|
|
||||||
|
|
||||||
|
# struct clip_vision_hparams {
|
||||||
|
# int32_t image_size;
|
||||||
|
# int32_t patch_size;
|
||||||
|
# int32_t hidden_size;
|
||||||
|
# int32_t n_intermediate;
|
||||||
|
# int32_t projection_dim;
|
||||||
|
# int32_t n_head;
|
||||||
|
# int32_t n_layer;
|
||||||
|
# float eps;
|
||||||
|
# };
|
||||||
|
class clip_vision_hparams(Structure):
|
||||||
|
_fields_ = [
|
||||||
|
("image_size", c_int32),
|
||||||
|
("patch_size", c_int32),
|
||||||
|
("hidden_size", c_int32),
|
||||||
|
("n_intermediate", c_int32),
|
||||||
|
("projection_dim", c_int32),
|
||||||
|
("n_head", c_int32),
|
||||||
|
("n_layer", c_int32),
|
||||||
|
("eps", c_float),
|
||||||
|
]
|
||||||
|
|
||||||
|
# /** load mmproj model */
|
||||||
|
# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
|
||||||
|
def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p:
|
||||||
|
return _libllava.clip_model_load(fname, verbosity)
|
||||||
|
|
||||||
|
_libllava.clip_model_load.argtypes = [c_char_p, c_int]
|
||||||
|
_libllava.clip_model_load.restype = clip_ctx_p
|
||||||
|
|
||||||
|
# /** free mmproj model */
|
||||||
|
# CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||||
|
def clip_free(ctx: clip_ctx_p):
|
||||||
|
return _libllava.clip_free(ctx)
|
||||||
|
|
||||||
|
_libllava.clip_free.argtypes = [clip_ctx_p]
|
||||||
|
_libllava.clip_free.restype = None
|
||||||
|
|
||||||
|
# size_t clip_embd_nbytes(const struct clip_ctx * ctx);
# int clip_n_patches(const struct clip_ctx * ctx);
# int clip_n_mmproj_embd(const struct clip_ctx * ctx);

# // RGB uint8 image
# struct clip_image_u8 {
#     int nx;
#     int ny;
#     uint8_t * data = NULL;
#     size_t size;
# };

# // RGB float32 image (NHWC)
# // Memory layout: RGBRGBRGB...
# struct clip_image_f32 {
#     int nx;
#     int ny;
#     float * data = NULL;
#     size_t size;
# };

# struct clip_image_u8_batch {
#     struct clip_image_u8 * data;
#     size_t size;
# };

# struct clip_image_f32_batch {
#     struct clip_image_f32 * data;
#     size_t size;
# };

# struct clip_image_u8 * make_clip_image_u8();
# struct clip_image_f32 * make_clip_image_f32();
# CLIP_API void clip_image_u8_free(clip_image_u8 * img);
# CLIP_API void clip_image_f32_free(clip_image_f32 * img);
# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
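Taken together, the wrappers above follow the C-side workflow: load the CLIP ("mmproj") model, build an image embedding from raw bytes, evaluate that embedding into a llama context, then free everything. The block below is only a rough sketch under stated assumptions: all file paths, the thread count, and the batch size are placeholders, and the `llava_image_embed_make_with_bytes` / `llava_image_embed_free` wrappers are assumed to be bound earlier in `llava_cpp.py`, mirroring `llava.h`.

```python
# Rough usage sketch; paths, n_threads, and n_batch are placeholders, and the
# llava_image_embed_* wrappers are assumed to be bound earlier in llava_cpp.py.
import ctypes

import llama_cpp
from llama_cpp import llava_cpp

# Create a plain llama context with the low-level API.
llama_cpp.llama_backend_init(False)
model = llama_cpp.llama_load_model_from_file(
    b"llava-v1.5-7b.Q4_K.gguf", llama_cpp.llama_model_default_params()
)
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 2048
llama_ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# Load the paired CLIP ("mmproj") model.
ctx_clip = llava_cpp.clip_model_load(b"mmproj-model-f16.gguf", 0)

# Turn raw image bytes into an image embedding.
with open("image.jpg", "rb") as f:
    image_bytes = f.read()
data = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes)
embed = llava_cpp.llava_image_embed_make_with_bytes(ctx_clip, 4, data, len(image_bytes))

# Evaluate the embedding into the llama context at position n_past.
n_past = ctypes.c_int(0)
llava_cpp.llava_eval_image_embed(llama_ctx, embed, 512, ctypes.byref(n_past))

# Free everything.
llava_cpp.llava_image_embed_free(embed)
llava_cpp.clip_free(ctx_clip)
llama_cpp.llama_free(llama_ctx)
llama_cpp.llama_free_model(model)
```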
@@ -138,6 +138,10 @@ class Settings(BaseSettings):
         default="llama-2",
         description="Chat format to use.",
     )
+    clip_model_path: Optional[str] = Field(
+        default=None,
+        description="Path to a CLIP model to use for multi-modal chat completion.",
+    )
     # Cache Params
     cache: bool = Field(
         default=False,
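Like the existing settings, the new field is expected to surface as a `--clip_model_path` CLI flag (or a `CLIP_MODEL_PATH` environment variable); per the `create_app` change below, it only takes effect when `chat_format` is set to `llava-1-5`.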
@@ -375,6 +379,14 @@ def create_app(settings: Optional[Settings] = None):
     )
     app.include_router(router)
     global llama
+
+    ##
+    chat_handler = None
+    if settings.chat_format == "llava-1-5":
+        assert settings.clip_model_path is not None
+        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path)
+    ##
+
     llama = llama_cpp.Llama(
         model_path=settings.model,
         # Model Params
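The same wiring can also be exercised programmatically: building a `Settings` object with `chat_format="llava-1-5"` and a `clip_model_path` and handing it to `create_app` takes the new branch above. A minimal sketch, with placeholder paths and port:

```python
# Hypothetical programmatic launch; model and mmproj paths are placeholders.
import uvicorn
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="llava-v1.5-7b.Q4_K.gguf",          # LLaVA language model (placeholder)
    clip_model_path="mmproj-model-f16.gguf",  # paired CLIP/mmproj model (placeholder)
    chat_format="llava-1-5",                  # selects the Llava15ChatHandler branch above
)
app = create_app(settings=settings)
uvicorn.run(app, host="127.0.0.1", port=8000)
```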
@@ -411,6 +423,7 @@ def create_app(settings: Optional[Settings] = None):
         numa=settings.numa,
         # Chat Format Params
         chat_format=settings.chat_format,
+        chat_handler=chat_handler,
         # Misc
         verbose=settings.verbose,
     )
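Outside the server, the `chat_handler` argument passed to `Llama` above is also available when constructing `Llama` directly in user code. The following is a hedged sketch of that path: the model paths and image URL are placeholders, and the content-parts message shape follows the gpt-4-vision-preview style types added in this PR.

```python
# Hedged sketch of direct library usage; file paths and the image URL are placeholders.
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

chat_handler = Llava15ChatHandler(clip_model_path="mmproj-model-f16.gguf")
llm = Llama(
    model_path="llava-v1.5-7b.Q4_K.gguf",
    chat_format="llava-1-5",
    chat_handler=chat_handler,
    n_ctx=2048,  # leave room for the image embedding in the context window
)
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant that describes images."},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "What is in this picture?"},
            ],
        },
    ]
)
print(response["choices"][0]["message"]["content"])
```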
@@ -580,10 +593,6 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: int = max_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
-    mirostat_mode: int = mirostat_mode_field
-    mirostat_tau: float = mirostat_tau_field
-    mirostat_eta: float = mirostat_eta_field
-    grammar: Optional[str] = None
     echo: bool = Field(
         default=False,
         description="Whether to echo the prompt in the generated text. Useful for chatbots.",
@@ -610,6 +619,10 @@ class CreateCompletionRequest(BaseModel):
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
+    mirostat_mode: int = mirostat_mode_field
+    mirostat_tau: float = mirostat_tau_field
+    mirostat_eta: float = mirostat_eta_field
+    grammar: Optional[str] = None

     model_config = {
         "json_schema_extra": {
@@ -688,7 +701,7 @@ async def create_completion(
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

     iterator_or_completion: Union[
-        llama_cpp.Completion, Iterator[llama_cpp.CompletionChunk]
+        llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
     ] = await run_in_threadpool(llama, **kwargs)

     if isinstance(iterator_or_completion, Iterator):
@@ -697,7 +710,7 @@ async def create_completion(

         # If no exception was raised from first_response, we can assume that
         # the iterator is valid and we can use it to stream the response.
-        def iterator() -> Iterator[llama_cpp.CompletionChunk]:
+        def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
             yield first_response
             yield from iterator_or_completion

@@ -748,27 +761,30 @@ class ChatCompletionRequestMessage(BaseModel):
     )
     content: Optional[str] = Field(default="", description="The content of the message.")

-from typing import Any

 class CreateChatCompletionRequest(BaseModel):
-    messages: List[Any] = Field(
+    messages: List[llama_cpp.ChatCompletionRequestMessage] = Field(
         default=[], description="A list of messages to generate completions for."
     )
     functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
         default=None,
         description="A list of functions to apply to the generated completions.",
     )
-    function_call: Optional[Union[Literal["auto", "none"], llama_cpp.ChatCompletionFunctionCallOption]] = Field(
+    function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field(
         default=None,
         description="A function to apply to the generated completions.",
     )
+    tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field(
+        default=None,
+        description="A list of tools to apply to the generated completions.",
+    )
+    tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
+        default=None,
+        description="A tool to apply to the generated completions.",
+    ) # TODO: verify
     max_tokens: int = max_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
-    mirostat_mode: int = mirostat_mode_field
-    mirostat_tau: float = mirostat_tau_field
-    mirostat_eta: float = mirostat_eta_field
-    grammar: Optional[str] = None
     stop: Optional[List[str]] = stop_field
     stream: bool = stream_field
     presence_penalty: Optional[float] = presence_penalty_field
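Since the request model now accepts `tools` / `tool_choice` alongside the older `functions` / `function_call` fields, a client can already submit OpenAI-style payloads such as the one sketched below. This is only an illustration of the request schema: the endpoint host and the tool definition are made up, and the `# TODO: verify` above signals that handler-side support for tools was still being confirmed at this point.

```python
# Illustrative request payload only; the URL and tool schema are placeholders.
import requests

body = {
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    "tool_choice": "auto",  # or a named tool object, depending on what the type accepts
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=body)
print(resp.json())
```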
@@ -784,6 +800,10 @@ class CreateChatCompletionRequest(BaseModel):
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
+    mirostat_mode: int = mirostat_mode_field
+    mirostat_tau: float = mirostat_tau_field
+    mirostat_eta: float = mirostat_eta_field
+    grammar: Optional[str] = None

     model_config = {
         "json_schema_extra": {
vendor/llama.cpp (vendored submodule): 2 changed lines
@@ -1 +1 @@
-Subproject commit 2833a6f63c1b87c7f4ac574bcf7a15a2f3bf3ede
+Subproject commit 381efbf480959bb6d1e247a8b0c2328f22e350f8