Multimodal Support (Llava 1.5) (#821)

* llava v1.5 integration

* Point llama.cpp to fork

* Add llava shared library target

* Fix type

* Update llama.cpp

* Add llava api

* Revert changes to llama and llama_cpp

* Update llava example

* Add types for new gpt-4-vision-preview api

* Fix typo

* Update llama.cpp

* Update llama_types to match OpenAI v1 API

* Update ChatCompletionFunction type

* Reorder request parameters

* More API type fixes

* Even More Type Updates

* Add parameter for custom chat_handler to Llama class

* Fix circular import

* Convert to absolute imports

* Fix

* Fix pydantic Jsontype bug

* Accept list of prompt tokens in create_completion

* Add llava1.5 chat handler

* Add Multimodal notebook

* Clean up examples

* Add server docs

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
Damian Stewart 2023-11-08 04:48:51 +01:00 committed by GitHub
parent 56171cf7bf
commit aab74f0b2b
10 changed files with 796 additions and 102 deletions

CMakeLists.txt

@ -41,4 +41,23 @@ if (LLAMA_BUILD)
FILES $<TARGET_RUNTIME_DLLS:llama> FILES $<TARGET_RUNTIME_DLLS:llama>
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
) )
add_subdirectory(vendor/llama.cpp/examples/llava)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
install(
TARGETS llava_shared
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
)
# Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
install(
TARGETS llava_shared
LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
)
endif() endif()

docs/server.md Normal file

@ -0,0 +1,77 @@
# OpenAI Compatible Server
`llama-cpp-python` offers an OpenAI API compatible web server.
This web server can be used to serve local models and easily connect them to existing clients.
## Setup
### Installation
The server can be installed by running the following command:
```bash
pip install llama-cpp-python[server]
```
### Running the server
The server can then be started by running the following command:
```bash
python3 -m llama_cpp.server --model <model_path>
```
### Server options
For a full list of options, run:
```bash
python3 -m llama_cpp.server --help
```
NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
## Guides
### Multi-modal Models
`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
read information from both text and images.
You'll first need to download one of the available multi-modal models in GGUF format:
- [llava1.5 7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava1.5 13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
Then, when you run the server, you'll also need to specify the path to the CLIP model used for image embedding, along with the `llava-1-5` chat format:
```bash
python3 -m llama_cpp.server --model <model_path> --clip_model_path <clip_model_path> --chat_format llava-1-5
```
Then you can use the OpenAI API client as normal:
```python3
from openai import OpenAI
client = OpenAI(base_url="http://<host>:<port>/v1", api_key="sk-xxx")
response = client.chat.completions.create(
model="gpt-4-vision-preview",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "<image_url>"
},
},
{"type": "text", "text": "What does the image say"},
],
}
],
)
print(response)
```
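The same multi-modal pipeline can also be driven directly from Python, without the server, by passing a `Llava15ChatHandler` to the `Llama` constructor. Below is a minimal sketch; the model and CLIP paths are placeholders, and `logits_all=True` is required by the handler.
```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths; point these at the GGUF language model and CLIP/mmproj weights you downloaded.
chat_handler = Llava15ChatHandler(clip_model_path="<clip_model_path>")
llm = Llama(
    model_path="<model_path>",
    chat_handler=chat_handler,
    n_ctx=2048,       # leave room for the image embedding in the context
    logits_all=True,  # required by the llava-1-5 chat handler
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "<image_url>"}},
                {"type": "text", "text": "What does the image say"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```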

examples/notebooks/Multimodal.ipynb Normal file

@ -0,0 +1,84 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ChatCompletion(id='chatcmpl-65a710ba-41d1-4d0a-a124-a44b2b4a0189', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content=' The image reads \"LlamaC++.\"', role='assistant', function_call=None, tool_calls=None))], created=1699413274, model='gpt-4-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=624, total_tokens=634))\n"
]
}
],
"source": [
"from openai import OpenAI\n",
"\n",
"import urllib.request\n",
"import base64\n",
"\n",
"def get_data_url(url):\n",
" return \"data:image/png;base64,\" + base64.b64encode(urllib.request.urlopen(url).read()).decode(\"utf-8\")\n",
"\n",
"client = OpenAI(base_url=\"http://100.64.159.73:8000/v1\", api_key=\"sk-1234\")\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4-vision-preview\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": get_data_url(\"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\"),\n",
" # \"url\": \"https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png\",\n",
" },\n",
" },\n",
" {\"type\": \"text\", \"text\": \"What does the image say\"},\n",
" ],\n",
" }\n",
" ],\n",
")\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5+"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

llama_cpp/llama.py

@ -21,9 +21,9 @@ from collections import deque, OrderedDict
import diskcache import diskcache
import ctypes import ctypes
from . import llama_cpp
from .llama_types import * from .llama_types import *
from .llama_grammar import LlamaGrammar from .llama_grammar import LlamaGrammar
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format import llama_cpp.llama_chat_format as llama_chat_format
import numpy as np import numpy as np
@ -752,6 +752,7 @@ class Llama:
numa: bool = False, numa: bool = False,
# Chat Format Params # Chat Format Params
chat_format: str = "llama-2", chat_format: str = "llama-2",
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
# Misc # Misc
verbose: bool = True, verbose: bool = True,
# Extra Params # Extra Params
@ -784,6 +785,7 @@ class Llama:
lora_path: Path to a LoRA file to apply to the model. lora_path: Path to a LoRA file to apply to the model.
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
chat_format: String specifying the chat format to use when calling create_chat_completion. chat_format: String specifying the chat format to use when calling create_chat_completion.
chat_handler: Optional chat handler to use when calling create_chat_completion.
verbose: Print verbose output to stderr. verbose: Print verbose output to stderr.
Raises: Raises:
@ -910,6 +912,7 @@ class Llama:
print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
self.chat_format = chat_format self.chat_format = chat_format
self.chat_handler = chat_handler
self._n_vocab = self.n_vocab() self._n_vocab = self.n_vocab()
self._n_ctx = self.n_ctx() self._n_ctx = self.n_ctx()
@ -1231,7 +1234,7 @@ class Llama:
else: else:
inputs = input inputs = input
data: List[EmbeddingData] = [] data: List[Embedding] = []
total_tokens = 0 total_tokens = 0
for index, input in enumerate(inputs): for index, input in enumerate(inputs):
tokens = self.tokenize(input.encode("utf-8"), special=True) tokens = self.tokenize(input.encode("utf-8"), special=True)
@ -1276,7 +1279,7 @@ class Llama:
def _create_completion( def _create_completion(
self, self,
prompt: str, prompt: Union[str, List[int]],
suffix: Optional[str] = None, suffix: Optional[str] = None,
max_tokens: int = 16, max_tokens: int = 16,
temperature: float = 0.8, temperature: float = 0.8,
@ -1297,7 +1300,9 @@ class Llama:
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None, grammar: Optional[LlamaGrammar] = None,
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: ) -> Union[
Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
]:
assert self._ctx is not None assert self._ctx is not None
assert suffix is None or suffix.__class__ is str assert suffix is None or suffix.__class__ is str
@ -1309,7 +1314,7 @@ class Llama:
self.tokenize(prompt.encode("utf-8"), special=True) self.tokenize(prompt.encode("utf-8"), special=True)
if prompt != "" if prompt != ""
else [self.token_bos()] else [self.token_bos()]
) ) if isinstance(prompt, str) else prompt
text: bytes = b"" text: bytes = b""
returned_tokens: int = 0 returned_tokens: int = 0
stop = ( stop = (
@ -1322,7 +1327,7 @@ class Llama:
if len(prompt_tokens) >= self._n_ctx: if len(prompt_tokens) >= self._n_ctx:
raise ValueError( raise ValueError(
f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self._ctx)}" f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
) )
if max_tokens <= 0: if max_tokens <= 0:
@ -1732,7 +1737,7 @@ class Llama:
def create_completion( def create_completion(
self, self,
prompt: str, prompt: Union[str, List[int]],
suffix: Optional[str] = None, suffix: Optional[str] = None,
max_tokens: int = 128, max_tokens: int = 128,
temperature: float = 0.8, temperature: float = 0.8,
@ -1753,7 +1758,7 @@ class Llama:
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None, grammar: Optional[LlamaGrammar] = None,
) -> Union[Completion, Iterator[CompletionChunk]]: ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt. """Generate text from a prompt.
Args: Args:
@ -1800,7 +1805,7 @@ class Llama:
grammar=grammar, grammar=grammar,
) )
if stream: if stream:
chunks: Iterator[CompletionChunk] = completion_or_chunks chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
return chunks return chunks
completion: Completion = next(completion_or_chunks) # type: ignore completion: Completion = next(completion_or_chunks) # type: ignore
return completion return completion
@ -1828,7 +1833,7 @@ class Llama:
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None, grammar: Optional[LlamaGrammar] = None,
) -> Union[Completion, Iterator[CompletionChunk]]: ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
"""Generate text from a prompt. """Generate text from a prompt.
Args: Args:
@ -1879,7 +1884,9 @@ class Llama:
self, self,
messages: List[ChatCompletionRequestMessage], messages: List[ChatCompletionRequestMessage],
functions: Optional[List[ChatCompletionFunction]] = None, functions: Optional[List[ChatCompletionFunction]] = None,
function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None, function_call: Optional[ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[ChatCompletionTool]] = None,
tool_choice: Optional[ChatCompletionToolChoiceOption] = None,
temperature: float = 0.2, temperature: float = 0.2,
top_p: float = 0.95, top_p: float = 0.95,
top_k: int = 40, top_k: int = 40,
@ -1896,7 +1903,9 @@ class Llama:
model: Optional[str] = None, model: Optional[str] = None,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: Optional[LogitsProcessorList] = None,
grammar: Optional[LlamaGrammar] = None, grammar: Optional[LlamaGrammar] = None,
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: ) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
"""Generate a chat completion from a list of messages. """Generate a chat completion from a list of messages.
Args: Args:
@ -1912,12 +1921,16 @@ class Llama:
Returns: Returns:
Generated chat completion or a stream of chat completion chunks. Generated chat completion or a stream of chat completion chunks.
""" """
handler = llama_chat_format.get_chat_completion_handler(self.chat_format) handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
self.chat_format
)
return handler( return handler(
self, llama=self,
messages=messages, messages=messages,
functions=functions, functions=functions,
function_call=function_call, function_call=function_call,
tools=tools,
tool_choice=tool_choice,
temperature=temperature, temperature=temperature,
top_p=top_p, top_p=top_p,
top_k=top_k, top_k=top_k,
@ -1974,6 +1987,7 @@ class Llama:
numa=self.numa, numa=self.numa,
# Chat Format Params # Chat Format Params
chat_format=self.chat_format, chat_format=self.chat_format,
chat_handler=self.chat_handler,
# Misc # Misc
verbose=self.verbose, verbose=self.verbose,
) )
@ -2015,6 +2029,7 @@ class Llama:
numa=state["numa"], numa=state["numa"],
# Chat Format Params # Chat Format Params
chat_format=state["chat_format"], chat_format=state["chat_format"],
chat_handler=state["chat_handler"],
# Misc # Misc
verbose=state["verbose"], verbose=state["verbose"],
) )

llama_cpp/llama_chat_format.py

@ -1,22 +1,24 @@
from __future__ import annotations from __future__ import annotations
import os import os
import ctypes
import dataclasses import dataclasses
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol
from . import llama_types import llama_cpp.llama_types as llama_types
from . import llama import llama_cpp.llama as llama
class LlamaChatCompletionHandler(Protocol): class LlamaChatCompletionHandler(Protocol):
def __call__( def __call__(
self, self,
*,
llama: llama.Llama, llama: llama.Llama,
messages: List[llama_types.ChatCompletionRequestMessage], messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None, functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[ function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
Union[str, llama_types.ChatCompletionFunctionCall] tools: Optional[List[llama_types.ChatCompletionTool]] = None,
] = None, tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
temperature: float = 0.2, temperature: float = 0.2,
top_p: float = 0.95, top_p: float = 0.95,
top_k: int = 40, top_k: int = 40,
@ -33,7 +35,8 @@ class LlamaChatCompletionHandler(Protocol):
model: Optional[str] = None, model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None, logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None, grammar: Optional[llama.LlamaGrammar] = None,
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: **kwargs, # type: ignore
) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
... ...
@ -199,7 +202,7 @@ def _convert_text_completion_to_chat(
def _convert_text_completion_chunks_to_chat( def _convert_text_completion_chunks_to_chat(
chunks: Iterator[llama_types.CompletionChunk], chunks: Iterator[llama_types.CreateCompletionStreamResponse],
) -> Iterator[llama_types.ChatCompletionChunk]: ) -> Iterator[llama_types.ChatCompletionChunk]:
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
if i == 0: if i == 0:
@ -239,12 +242,15 @@ def _convert_text_completion_chunks_to_chat(
def _convert_completion_to_chat( def _convert_completion_to_chat(
completion_or_chunks: Union[ completion_or_chunks: Union[
llama_types.Completion, Iterator[llama_types.CompletionChunk] llama_types.CreateCompletionResponse,
Iterator[llama_types.CreateCompletionStreamResponse],
], ],
stream: bool = False, stream: bool = False,
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: ) -> Union[
llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]
]:
if stream: if stream:
chunks: Iterator[llama_types.CompletionChunk] = completion_or_chunks # type: ignore chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore
return _convert_text_completion_chunks_to_chat(chunks) return _convert_text_completion_chunks_to_chat(chunks)
else: else:
completion: llama_types.Completion = completion_or_chunks # type: ignore completion: llama_types.Completion = completion_or_chunks # type: ignore
@ -329,7 +335,9 @@ def get_chat_format(name: str):
) )
def hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path: Union[str, os.PathLike[str]]) -> ChatFormatter: def hf_autotokenizer_to_chat_formatter(
pretrained_model_name_or_path: Union[str, os.PathLike[str]]
) -> ChatFormatter:
# https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/docs/transformers/main/chat_templating
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
# https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
@ -538,7 +546,7 @@ def functionary_chat_handler(
llama: llama.Llama, llama: llama.Llama,
messages: List[llama_types.ChatCompletionRequestMessage], messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None, functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[Union[str, llama_types.ChatCompletionFunctionCall]] = None, function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
temperature: float = 0.2, temperature: float = 0.2,
top_p: float = 0.95, top_p: float = 0.95,
top_k: int = 40, top_k: int = 40,
@ -555,6 +563,7 @@ def functionary_chat_handler(
model: Optional[str] = None, model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None, logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None, grammar: Optional[llama.LlamaGrammar] = None,
**kwargs, # type: ignore
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
@ -613,13 +622,13 @@ def functionary_chat_handler(
all_messages: List[llama_types.ChatCompletionRequestMessage] = [] all_messages: List[llama_types.ChatCompletionRequestMessage] = []
if functions is not None: if functions is not None:
all_messages.append( all_messages.append(
llama_types.ChatCompletionRequestMessage( llama_types.ChatCompletionRequestSystemMessage(
role="system", content=generate_schema_from_functions(functions) role="system", content=generate_schema_from_functions(functions)
) )
) )
all_messages.append( all_messages.append(
llama_types.ChatCompletionRequestMessage( llama_types.ChatCompletionRequestSystemMessage(
role="system", content=SYSTEM_MESSAGE role="system", content=SYSTEM_MESSAGE
) )
) )
@ -636,7 +645,9 @@ def functionary_chat_handler(
all_messages.append(message) all_messages.append(message)
all_messages.append( all_messages.append(
llama_types.ChatCompletionRequestMessage(role="assistant", content=None) llama_types.ChatCompletionRequestAssistantMessage(
role="assistant", content=None
)
) )
def message_to_str(msg: llama_types.ChatCompletionRequestMessage): def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
@ -713,6 +724,10 @@ def functionary_chat_handler(
prompt=new_prompt, stop=["user:", "</s>"], stream=False prompt=new_prompt, stop=["user:", "</s>"], stream=False
) # type: ignore ) # type: ignore
assert "usage" in completion
assert isinstance(function_call, str)
assert stream is False # TODO: support stream mode
return llama_types.CreateChatCompletionResponse( return llama_types.CreateChatCompletionResponse(
id="chat" + completion["id"], id="chat" + completion["id"],
object="chat.completion", object="chat.completion",
@ -734,3 +749,119 @@ def functionary_chat_handler(
], ],
usage=completion["usage"], usage=completion["usage"],
) )
class Llava15ChatHandler:
def __init__(self, clip_model_path: str):
import llama_cpp.llava_cpp as llava_cpp
self._llava_cpp = llava_cpp
self.clip_model_path = clip_model_path
self.clip_ctx = self._llava_cpp.clip_model_load(self.clip_model_path.encode(), 0)
def __del__(self):
if self.clip_ctx is not None:
self._llava_cpp.clip_free(self.clip_ctx)
self.clip_ctx = None
def load_image(self, image_url: str) -> bytes:
if image_url.startswith("data:"):
import base64
image_bytes = base64.b64decode(image_url.split(",")[1])
return image_bytes
else:
import urllib.request
with urllib.request.urlopen(image_url) as f:
image_bytes = f.read()
return image_bytes
def __call__(
self,
*,
llama: llama.Llama,
messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
temperature: float = 0.2,
top_p: float = 0.95,
top_k: int = 40,
stream: bool = False,
stop: Optional[Union[str, List[str]]] = [],
max_tokens: int = 256,
presence_penalty: float = 0.0,
frequency_penalty: float = 0.0,
repeat_penalty: float = 1.1,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None,
**kwargs, # type: ignore
) -> Union[llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse]]:
assert llama.context_params.logits_all is True # BUG: logits_all=True is required for llava
assert self.clip_ctx is not None
system_prompt = _get_system_message(messages)
system_prompt = system_prompt if system_prompt != "" else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
user_role = "\nUSER:"
assistant_role = "\nASSISTANT:"
llama.reset()
llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True))
for message in messages:
if message["role"] == "user" and message["content"] is not None:
if isinstance(message["content"], str):
llama.eval(llama.tokenize(f"{user_role} {message['content']}".encode("utf8"), add_bos=False))
else:
assert isinstance(message["content"], list)
llama.eval(llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False))
for content in message["content"]:
if content["type"] == "text":
llama.eval(llama.tokenize(f"{content['text']}".encode("utf8"), add_bos=False))
if content["type"] == "image_url":
image_bytes = self.load_image(content["image_url"]["url"]) if isinstance(content["image_url"], dict) else self.load_image(content["image_url"])
import array
data_array = array.array('B', image_bytes)
c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array)
embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=llama.context_params.n_threads, image_bytes=c_ubyte_ptr, image_bytes_length=len(image_bytes))
# image_bytes_p = (ctypes.c_uint8 * len(image_bytes)).from_buffer_copy(image_bytes)
# embed = self._llava_cpp.llava_image_embed_make_with_bytes(ctx_clip=self.clip_ctx, n_threads=1, image_bytes=image_bytes_p, image_bytes_length=len(image_bytes))
try:
n_past = ctypes.c_int(llama.n_tokens)
n_past_p = ctypes.pointer(n_past)
self._llava_cpp.llava_eval_image_embed(ctx_llama=llama.ctx, embed=embed, n_batch=llama.n_batch, n_past=n_past_p)
assert llama.n_ctx() >= n_past.value
llama.n_tokens = n_past.value
finally:
self._llava_cpp.llava_image_embed_free(embed)
if message["role"] == "assistant" and message["content"] is not None:
llama.eval(llama.tokenize(f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False))
llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))
prompt = llama._input_ids.tolist()
return _convert_completion_to_chat(llama.create_completion(
prompt=prompt,
temperature=temperature,
top_p=top_p,
top_k=top_k,
stream=stream,
stop=stop,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
repeat_penalty=repeat_penalty,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
model=model,
logits_processor=logits_processor,
grammar=grammar,
), stream=stream)

llama_cpp/llama_grammar.py

@ -19,7 +19,7 @@ from typing import (
overload, overload,
) )
from . import llama_cpp import llama_cpp.llama_cpp as llama_cpp
# Type aliases # Type aliases
llama_grammar_element = llama_cpp.llama_grammar_element llama_grammar_element = llama_cpp.llama_grammar_element

llama_cpp/llama_types.py

@ -1,4 +1,6 @@
"""Types and request signatrues for OpenAI compatibility """Types and request signatures for OpenAI compatibility
NOTE: These types may change to match the OpenAI OpenAPI specification.
Based on the OpenAI OpenAPI specification: Based on the OpenAI OpenAPI specification:
https://github.com/openai/openai-openapi/blob/master/openapi.yaml https://github.com/openai/openai-openapi/blob/master/openapi.yaml
@ -8,6 +10,12 @@ from typing import Any, List, Optional, Dict, Union
from typing_extensions import TypedDict, NotRequired, Literal from typing_extensions import TypedDict, NotRequired, Literal
# NOTE: Defining this correctly using annotations seems to break pydantic validation.
# This is a workaround until we can figure out how to do this correctly
# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]]
class EmbeddingUsage(TypedDict): class EmbeddingUsage(TypedDict):
prompt_tokens: int prompt_tokens: int
total_tokens: int total_tokens: int
@ -19,9 +27,6 @@ class Embedding(TypedDict):
embedding: List[float] embedding: List[float]
EmbeddingData = Embedding
class CreateEmbeddingResponse(TypedDict): class CreateEmbeddingResponse(TypedDict):
object: Literal["list"] object: Literal["list"]
model: str model: str
@ -49,110 +54,92 @@ class CompletionUsage(TypedDict):
total_tokens: int total_tokens: int
class CreateCompletionStreamResponse(TypedDict):
id: str
object: Literal["text_completion"]
created: int
model: str
choices: List[CompletionChoice]
CompletionChunk = CreateCompletionStreamResponse
class CreateCompletionResponse(TypedDict): class CreateCompletionResponse(TypedDict):
id: str id: str
object: Literal["text_completion"] object: Literal["text_completion"]
created: int created: int
model: str model: str
choices: List[CompletionChoice] choices: List[CompletionChoice]
usage: CompletionUsage usage: NotRequired[CompletionUsage]
Completion = CreateCompletionResponse class ChatCompletionResponseFunctionCall(TypedDict):
class ChatCompletionFunctionCall(TypedDict):
name: str name: str
arguments: str arguments: str
class ChatCompletionResponseMessage(TypedDict): class ChatCompletionResponseMessage(TypedDict):
role: Literal["assistant", "user", "system", "function"]
content: Optional[str] content: Optional[str]
user: NotRequired[str] tool_calls: NotRequired["ChatCompletionMessageToolCalls"]
function_call: NotRequired[ChatCompletionFunctionCall] role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here
function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED
ChatCompletionMessage = ChatCompletionResponseMessage class ChatCompletionFunction(TypedDict):
class ChatCompletionResponseFunction(TypedDict):
name: str name: str
description: NotRequired[str] description: NotRequired[str]
parameters: Dict[str, Any] # TODO: make this more specific parameters: Dict[str, JsonType] # TODO: make this more specific
ChatCompletionFunction = ChatCompletionResponseFunction
class ChatCompletionResponseChoice(TypedDict): class ChatCompletionResponseChoice(TypedDict):
index: int index: int
message: ChatCompletionMessage message: "ChatCompletionResponseMessage"
finish_reason: Optional[str] finish_reason: Optional[str]
ChatCompletionChoice = ChatCompletionResponseChoice
class CreateChatCompletionResponse(TypedDict): class CreateChatCompletionResponse(TypedDict):
id: str id: str
object: Literal["chat.completion"] object: Literal["chat.completion"]
created: int created: int
model: str model: str
choices: List[ChatCompletionChoice] choices: List["ChatCompletionResponseChoice"]
usage: CompletionUsage usage: CompletionUsage
ChatCompletion = CreateChatCompletionResponse class ChatCompletionMessageToolCallChunkFunction(TypedDict):
name: str
arguments: str
class ChatCompletionMessageToolCallChunk(TypedDict):
index: int
id: NotRequired[str]
type: Literal["function"]
function: ChatCompletionMessageToolCallChunkFunction
class ChatCompletionStreamResponseDeltaEmpty(TypedDict): class ChatCompletionStreamResponseDeltaEmpty(TypedDict):
pass pass
ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
name: str
arguments: str
class ChatCompletionStreamResponseDelta(TypedDict): class ChatCompletionStreamResponseDelta(TypedDict):
role: NotRequired[Literal["assistant"]]
content: NotRequired[str] content: NotRequired[str]
function_call: NotRequired[ChatCompletionFunctionCall] function_call: NotRequired[
ChatCompletionStreamResponseDeltaFunctionCall
] # DEPRECATED
ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
role: NotRequired[Literal["system", "user", "assistant", "tool"]]
class ChatCompletionStreamResponseChoice(TypedDict): class ChatCompletionStreamResponseChoice(TypedDict):
index: int index: int
delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] delta: Union[
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
]
finish_reason: Optional[Literal["stop", "length", "function_call"]] finish_reason: Optional[Literal["stop", "length", "function_call"]]
ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice class CreateChatCompletionStreamResponse(TypedDict):
class ChatCompletionStreamResponse(TypedDict):
id: str id: str
model: str model: str
object: Literal["chat.completion.chunk"] object: Literal["chat.completion.chunk"]
created: int created: int
choices: List[ChatCompletionChunkChoice] choices: List[ChatCompletionStreamResponseChoice]
ChatCompletionChunk = ChatCompletionStreamResponse
JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
class ChatCompletionFunctions(TypedDict): class ChatCompletionFunctions(TypedDict):
@ -165,8 +152,137 @@ class ChatCompletionFunctionCallOption(TypedDict):
name: str name: str
class ChatCompletionRequestMessage(TypedDict): class ChatCompletionRequestMessageContentPartText(TypedDict):
role: Literal["assistant", "user", "system", "function"] type: Literal["text"]
text: str
class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict):
url: str
detail: NotRequired[Literal["auto", "low", "high"]]
class ChatCompletionRequestMessageContentPartImage(TypedDict):
type: Literal["image_url"]
image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl]
ChatCompletionRequestMessageContentPart = Union[
ChatCompletionRequestMessageContentPartText,
ChatCompletionRequestMessageContentPartImage,
]
class ChatCompletionRequestSystemMessage(TypedDict):
role: Literal["system"]
content: Optional[str] content: Optional[str]
name: NotRequired[str]
function_call: NotRequired[ChatCompletionFunctionCall]
class ChatCompletionRequestUserMessage(TypedDict):
role: Literal["user"]
content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]]
class ChatCompletionMessageToolCallFunction(TypedDict):
name: str
arguments: str
class ChatCompletionMessageToolCall(TypedDict):
id: str
type: Literal["function"]
function: ChatCompletionMessageToolCallFunction
ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall]
class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict):
name: str
arguments: str
class ChatCompletionRequestAssistantMessage(TypedDict):
role: Literal["assistant"]
content: Optional[str]
tool_calls: NotRequired[ChatCompletionMessageToolCalls]
function_call: NotRequired[
ChatCompletionRequestAssistantMessageFunctionCall
] # DEPRECATED
class ChatCompletionRequestToolMessage(TypedDict):
role: Literal["tool"]
content: Optional[str]
tool_call_id: str
class ChatCompletionRequestFunctionMessage(TypedDict):
role: Literal["function"]
content: Optional[str]
name: str
ChatCompletionRequestMessage = Union[
ChatCompletionRequestSystemMessage,
ChatCompletionRequestUserMessage,
ChatCompletionRequestAssistantMessage,
ChatCompletionRequestUserMessage,
ChatCompletionRequestToolMessage,
ChatCompletionRequestFunctionMessage,
]
class ChatCompletionRequestFunctionCallOption(TypedDict):
name: str
ChatCompletionRequestFunctionCall = Union[
Literal["none", "auto"], ChatCompletionRequestFunctionCallOption
]
ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific
class ChatCompletionToolFunction(TypedDict):
name: str
description: NotRequired[str]
parameters: ChatCompletionFunctionParameters
class ChatCompletionTool(TypedDict):
type: Literal["function"]
function: ChatCompletionToolFunction
class ChatCompletionNamedToolChoiceFunction(TypedDict):
name: str
class ChatCompletionNamedToolChoice(TypedDict):
type: Literal["function"]
function: ChatCompletionNamedToolChoiceFunction
ChatCompletionToolChoiceOption = Union[
Literal["none", "auto"], ChatCompletionNamedToolChoice
]
# NOTE: The following type names are not part of the OpenAI OpenAPI specification
# and will be removed in a future major release.
EmbeddingData = Embedding
CompletionChunk = CreateCompletionResponse
Completion = CreateCompletionResponse
CreateCompletionStreamResponse = CreateCompletionResponse
ChatCompletionMessage = ChatCompletionResponseMessage
ChatCompletionChoice = ChatCompletionResponseChoice
ChatCompletion = CreateChatCompletionResponse
ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
ChatCompletionChunk = CreateChatCompletionStreamResponse
ChatCompletionStreamResponse = CreateChatCompletionStreamResponse
ChatCompletionResponseFunction = ChatCompletionFunction
ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall
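As a rough illustration (not part of the diff), the new request-message TypedDicts can be used to annotate message payloads for static type checkers; at runtime they are plain dicts:
```python
from typing import List
from llama_cpp.llama_types import (
    ChatCompletionRequestMessage,
    ChatCompletionRequestSystemMessage,
    ChatCompletionRequestUserMessage,
)

# TypedDict annotations add no runtime overhead; they only guide type checkers.
system_message: ChatCompletionRequestSystemMessage = {
    "role": "system",
    "content": "You are a helpful assistant.",
}
user_message: ChatCompletionRequestUserMessage = {
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": "<image_url>"}},
        {"type": "text", "text": "What does the image say"},
    ],
}
messages: List[ChatCompletionRequestMessage] = [system_message, user_message]
```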

llama_cpp/llava_cpp.py Normal file

@ -0,0 +1,232 @@
import sys
import os
import ctypes
from ctypes import (
c_bool,
c_char_p,
c_int,
c_int8,
c_int32,
c_uint8,
c_uint32,
c_size_t,
c_float,
c_double,
c_void_p,
POINTER,
_Pointer, # type: ignore
Structure,
Array,
)
import pathlib
from typing import List, Union
import llama_cpp.llama_cpp as llama_cpp
# Load the library
def _load_shared_library(lib_base_name: str):
# Construct the paths to the possible shared library names
_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
# Searching for the library in the current directory under the name "libllama" (default name
# for llamacpp) and "llama" (default name for this repo)
_lib_paths: List[pathlib.Path] = []
# Determine the file extension based on the platform
if sys.platform.startswith("linux"):
_lib_paths += [
_base_path / f"lib{lib_base_name}.so",
]
elif sys.platform == "darwin":
_lib_paths += [
_base_path / f"lib{lib_base_name}.so",
_base_path / f"lib{lib_base_name}.dylib",
]
elif sys.platform == "win32":
_lib_paths += [
_base_path / f"{lib_base_name}.dll",
_base_path / f"lib{lib_base_name}.dll",
]
else:
raise RuntimeError("Unsupported platform")
if "LLAMA_CPP_LIB" in os.environ:
lib_base_name = os.environ["LLAMA_CPP_LIB"]
_lib = pathlib.Path(lib_base_name)
_base_path = _lib.parent.resolve()
_lib_paths = [_lib.resolve()]
cdll_args = dict() # type: ignore
# Add the library directory to the DLL search path on Windows (if needed)
if sys.platform == "win32" and sys.version_info >= (3, 8):
os.add_dll_directory(str(_base_path))
if "CUDA_PATH" in os.environ:
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
cdll_args["winmode"] = ctypes.RTLD_GLOBAL
# Try to load the shared library, handling potential errors
for _lib_path in _lib_paths:
if _lib_path.exists():
try:
return ctypes.CDLL(str(_lib_path), **cdll_args)
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found"
)
# Specify the base name of the shared library to load
_libllava_base_name = "llava"
# Load the library
_libllava = _load_shared_library(_libllava_base_name)
################################################
# llava.h
################################################
# struct clip_ctx;
clip_ctx_p = c_void_p
# struct llava_image_embed {
# float * embed;
# int n_image_pos;
# };
class llava_image_embed(Structure):
_fields_ = [
("embed", POINTER(c_float)),
("n_image_pos", c_int),
]
# /** sanity check for clip <-> llava embed size match */
# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
def llava_validate_embed_size(ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p) -> bool:
return _libllava.llava_validate_embed_size(ctx_llama, ctx_clip)
_libllava.llava_validate_embed_size.argtypes = [llama_cpp.llama_context_p, clip_ctx_p]
_libllava.llava_validate_embed_size.restype = c_bool
# /** build an image embed from image file bytes */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int]) -> "_Pointer[llava_image_embed]":
return _libllava.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
_libllava.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, POINTER(c_uint8), c_int]
_libllava.llava_image_embed_make_with_bytes.restype = POINTER(llava_image_embed)
# /** build an image embed from a path to an image filename */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes) -> "_Pointer[llava_image_embed]":
return _libllava.llava_image_embed_make_with_filename(ctx_clip, n_threads, image_path)
_libllava.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
_libllava.llava_image_embed_make_with_filename.restype = POINTER(llava_image_embed)
# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
# /** free an embedding made with llava_image_embed_make_* */
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]"):
return _libllava.llava_image_embed_free(embed)
_libllava.llava_image_embed_free.argtypes = [POINTER(llava_image_embed)]
_libllava.llava_image_embed_free.restype = None
# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]") -> bool:
return _libllava.llava_eval_image_embed(ctx_llama, embed, n_batch, n_past)
_libllava.llava_eval_image_embed.argtypes = [llama_cpp.llama_context_p, POINTER(llava_image_embed), c_int, POINTER(c_int)]
_libllava.llava_eval_image_embed.restype = c_bool
################################################
# clip.h
################################################
# struct clip_vision_hparams {
# int32_t image_size;
# int32_t patch_size;
# int32_t hidden_size;
# int32_t n_intermediate;
# int32_t projection_dim;
# int32_t n_head;
# int32_t n_layer;
# float eps;
# };
class clip_vision_hparams(Structure):
_fields_ = [
("image_size", c_int32),
("patch_size", c_int32),
("hidden_size", c_int32),
("n_intermediate", c_int32),
("projection_dim", c_int32),
("n_head", c_int32),
("n_layer", c_int32),
("eps", c_float),
]
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p:
return _libllava.clip_model_load(fname, verbosity)
_libllava.clip_model_load.argtypes = [c_char_p, c_int]
_libllava.clip_model_load.restype = clip_ctx_p
# /** free mmproj model */
# CLIP_API void clip_free(struct clip_ctx * ctx);
def clip_free(ctx: clip_ctx_p):
return _libllava.clip_free(ctx)
_libllava.clip_free.argtypes = [clip_ctx_p]
_libllava.clip_free.restype = None
# size_t clip_embd_nbytes(const struct clip_ctx * ctx);
# int clip_n_patches(const struct clip_ctx * ctx);
# int clip_n_mmproj_embd(const struct clip_ctx * ctx);
# // RGB uint8 image
# struct clip_image_u8 {
# int nx;
# int ny;
# uint8_t * data = NULL;
# size_t size;
# };
# // RGB float32 image (NHWC)
# // Memory layout: RGBRGBRGB...
# struct clip_image_f32 {
# int nx;
# int ny;
# float * data = NULL;
# size_t size;
# };
# struct clip_image_u8_batch {
# struct clip_image_u8 * data;
# size_t size;
# };
# struct clip_image_f32_batch {
# struct clip_image_f32 * data;
# size_t size;
# };
# struct clip_image_u8 * make_clip_image_u8();
# struct clip_image_f32 * make_clip_image_f32();
# CLIP_API void clip_image_u8_free(clip_image_u8 * img);
# CLIP_API void clip_image_f32_free(clip_image_f32 * img);
# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
# float * vec);
# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
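For orientation, here is a minimal sketch of calling these raw bindings directly rather than through `Llava15ChatHandler`; the model, CLIP/mmproj, and image paths are placeholders:
```python
import ctypes

from llama_cpp import Llama
import llama_cpp.llava_cpp as llava_cpp

llm = Llama(model_path="<model_path>")                                 # placeholder path
clip_ctx = llava_cpp.clip_model_load("<clip_model_path>".encode(), 0)  # placeholder path

# Build an image embedding and write it into the llama context at the current position.
embed = llava_cpp.llava_image_embed_make_with_filename(clip_ctx, 4, "<image_path>".encode())
try:
    n_past = ctypes.c_int(llm.n_tokens)
    llava_cpp.llava_eval_image_embed(llm.ctx, embed, llm.n_batch, ctypes.pointer(n_past))
    llm.n_tokens = n_past.value
finally:
    llava_cpp.llava_image_embed_free(embed)
llava_cpp.clip_free(clip_ctx)
```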

llama_cpp/server/app.py

@ -138,6 +138,10 @@ class Settings(BaseSettings):
default="llama-2", default="llama-2",
description="Chat format to use.", description="Chat format to use.",
) )
clip_model_path: Optional[str] = Field(
default=None,
description="Path to a CLIP model to use for multi-modal chat completion.",
)
# Cache Params # Cache Params
cache: bool = Field( cache: bool = Field(
default=False, default=False,
@ -375,6 +379,14 @@ def create_app(settings: Optional[Settings] = None):
) )
app.include_router(router) app.include_router(router)
global llama global llama
##
chat_handler = None
if settings.chat_format == "llava-1-5":
assert settings.clip_model_path is not None
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path)
##
llama = llama_cpp.Llama( llama = llama_cpp.Llama(
model_path=settings.model, model_path=settings.model,
# Model Params # Model Params
@ -411,6 +423,7 @@ def create_app(settings: Optional[Settings] = None):
numa=settings.numa, numa=settings.numa,
# Chat Format Params # Chat Format Params
chat_format=settings.chat_format, chat_format=settings.chat_format,
chat_handler=chat_handler,
# Misc # Misc
verbose=settings.verbose, verbose=settings.verbose,
) )
@ -580,10 +593,6 @@ class CreateCompletionRequest(BaseModel):
max_tokens: int = max_tokens_field max_tokens: int = max_tokens_field
temperature: float = temperature_field temperature: float = temperature_field
top_p: float = top_p_field top_p: float = top_p_field
mirostat_mode: int = mirostat_mode_field
mirostat_tau: float = mirostat_tau_field
mirostat_eta: float = mirostat_eta_field
grammar: Optional[str] = None
echo: bool = Field( echo: bool = Field(
default=False, default=False,
description="Whether to echo the prompt in the generated text. Useful for chatbots.", description="Whether to echo the prompt in the generated text. Useful for chatbots.",
@ -610,6 +619,10 @@ class CreateCompletionRequest(BaseModel):
top_k: int = top_k_field top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field repeat_penalty: float = repeat_penalty_field
logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
mirostat_mode: int = mirostat_mode_field
mirostat_tau: float = mirostat_tau_field
mirostat_eta: float = mirostat_eta_field
grammar: Optional[str] = None
model_config = { model_config = {
"json_schema_extra": { "json_schema_extra": {
@ -688,7 +701,7 @@ async def create_completion(
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
iterator_or_completion: Union[ iterator_or_completion: Union[
llama_cpp.Completion, Iterator[llama_cpp.CompletionChunk] llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
] = await run_in_threadpool(llama, **kwargs) ] = await run_in_threadpool(llama, **kwargs)
if isinstance(iterator_or_completion, Iterator): if isinstance(iterator_or_completion, Iterator):
@ -697,7 +710,7 @@ async def create_completion(
# If no exception was raised from first_response, we can assume that # If no exception was raised from first_response, we can assume that
# the iterator is valid and we can use it to stream the response. # the iterator is valid and we can use it to stream the response.
def iterator() -> Iterator[llama_cpp.CompletionChunk]: def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
yield first_response yield first_response
yield from iterator_or_completion yield from iterator_or_completion
@ -748,27 +761,30 @@ class ChatCompletionRequestMessage(BaseModel):
) )
content: Optional[str] = Field(default="", description="The content of the message.") content: Optional[str] = Field(default="", description="The content of the message.")
from typing import Any
class CreateChatCompletionRequest(BaseModel): class CreateChatCompletionRequest(BaseModel):
messages: List[Any] = Field( messages: List[llama_cpp.ChatCompletionRequestMessage] = Field(
default=[], description="A list of messages to generate completions for." default=[], description="A list of messages to generate completions for."
) )
functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
default=None, default=None,
description="A list of functions to apply to the generated completions.", description="A list of functions to apply to the generated completions.",
) )
function_call: Optional[Union[Literal["auto", "none"], llama_cpp.ChatCompletionFunctionCallOption]] = Field( function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field(
default=None, default=None,
description="A function to apply to the generated completions.", description="A function to apply to the generated completions.",
) )
tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field(
default=None,
description="A list of tools to apply to the generated completions.",
)
tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
default=None,
description="A tool to apply to the generated completions.",
) # TODO: verify
max_tokens: int = max_tokens_field max_tokens: int = max_tokens_field
temperature: float = temperature_field temperature: float = temperature_field
top_p: float = top_p_field top_p: float = top_p_field
mirostat_mode: int = mirostat_mode_field
mirostat_tau: float = mirostat_tau_field
mirostat_eta: float = mirostat_eta_field
grammar: Optional[str] = None
stop: Optional[List[str]] = stop_field stop: Optional[List[str]] = stop_field
stream: bool = stream_field stream: bool = stream_field
presence_penalty: Optional[float] = presence_penalty_field presence_penalty: Optional[float] = presence_penalty_field
@ -784,6 +800,10 @@ class CreateChatCompletionRequest(BaseModel):
top_k: int = top_k_field top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field repeat_penalty: float = repeat_penalty_field
logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
mirostat_mode: int = mirostat_mode_field
mirostat_tau: float = mirostat_tau_field
mirostat_eta: float = mirostat_eta_field
grammar: Optional[str] = None
model_config = { model_config = {
"json_schema_extra": { "json_schema_extra": {

vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 2833a6f63c1b87c7f4ac574bcf7a15a2f3bf3ede Subproject commit 381efbf480959bb6d1e247a8b0c2328f22e350f8