Compare commits

...

16 commits

Author SHA1 Message Date
fc2af04c15
Merge https://github.com/abetlen/llama-cpp-python 2024-05-30 08:20:49 +05:30
Andrei Betlen
165b4dc6c1 fix: Fix typo in Llama3VisionAlphaChatHandler. Closes #1488 2024-05-29 02:29:44 -04:00
Andrei Betlen
91d05aba46 fix: adjust kv_override member names to match llama.cpp 2024-05-29 02:28:58 -04:00
Andrei Betlen
df45a4b3fe fix: fix string value kv_overrides. Closes #1487 2024-05-29 02:02:22 -04:00
Andrei Betlen
10b7c50cd2 Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main 2024-05-28 22:52:30 -04:00
Andrei Betlen
2907c26906 misc: Update debug build to keep all debug symbols for easier gdb debugging 2024-05-28 22:52:28 -04:00
Andrei Betlen
c26004b1be feat: Update llama.cpp 2024-05-28 22:52:03 -04:00
dependabot[bot]
c564007ff6
chore(deps): bump pypa/cibuildwheel from 2.18.0 to 2.18.1 (#1472)
updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Andrei <abetlen@gmail.com>
2024-05-27 10:57:17 -04:00
Andrei Betlen
454c9bb1cb feat: Update llama.cpp 2024-05-27 10:51:57 -04:00
Andrei Betlen
2d89964147 docs: Fix table formatting 2024-05-24 11:55:41 -04:00
Andrei Betlen
9e8d7d55bd fix(docs): Fix link typo 2024-05-24 11:55:01 -04:00
Andrei Betlen
ec43e8920f docs: Update multi-modal model section 2024-05-24 11:54:15 -04:00
Andrei Betlen
a4c9ab885d chore: Bump version 2024-05-24 01:59:25 -04:00
Linghan Zhong
5cae1040e3
feat: Improve Llama.eval performance by avoiding list conversion (#1476)
Co-authored-by: Andrei <abetlen@gmail.com>
2024-05-24 01:49:44 -04:00
Andrei Betlen
087cc0b036 feat: Update llama.cpp 2024-05-24 01:43:36 -04:00
Andrei Betlen
5a595f035a feat: Update llama.cpp 2024-05-22 02:40:31 -04:00
10 changed files with 93 additions and 39 deletions

View file

@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all] python -m pip install -e .[all]
- name: Build wheels - name: Build wheels
uses: pypa/cibuildwheel@v2.18.0 uses: pypa/cibuildwheel@v2.18.1
env: env:
# disable repair # disable repair
CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_REPAIR_WHEEL_COMMAND: ""
@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64 platforms: linux/arm64
- name: Build wheels - name: Build wheels
uses: pypa/cibuildwheel@v2.18.0 uses: pypa/cibuildwheel@v2.18.1
env: env:
CIBW_SKIP: "*musllinux* pp*" CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_REPAIR_WHEEL_COMMAND: ""

View file

@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [0.2.76]
- feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
- feat: Improve Llama.eval performance by avoiding list conversion by @thoughtp0lice in #1476
- example: LLM inference with Ray Serve by @rgerganov in #1465
## [0.2.75] ## [0.2.75]
- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305 - feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305

View file

@ -13,7 +13,13 @@ build:
python3 -m pip install --verbose -e . python3 -m pip install --verbose -e .
build.debug: build.debug:
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable . python3 -m pip install \
--verbose \
--config-settings=cmake.verbose=true \
--config-settings=logging.level=INFO \
--config-settings=install.strip=false \
--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
--editable .
build.cuda: build.cuda:
CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e . CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

View file

@ -499,13 +499,16 @@ llm = Llama.from_pretrained(
`llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images. `llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images.
You'll first need to download one of the available multi-modal models in GGUF format: Below are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API).
- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | Model | `LlamaChatHandler` | `chat_format` |
- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) |:--- |:--- |:--- |
- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) | [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |
- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |
- [moondream2](https://huggingface.co/vikhyatk/moondream2) | [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |
| [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` |
| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.

View file

@ -1,4 +1,4 @@
from .llama_cpp import * from .llama_cpp import *
from .llama import * from .llama import *
__version__ = "0.2.75" __version__ = "0.2.76"

View file

@ -6,6 +6,7 @@ import uuid
import time import time
import json import json
import ctypes import ctypes
import typing
import fnmatch import fnmatch
import multiprocessing import multiprocessing
@ -249,13 +250,13 @@ class Llama:
self._kv_overrides_array[i].key = k.encode("utf-8") self._kv_overrides_array[i].key = k.encode("utf-8")
if isinstance(v, bool): if isinstance(v, bool):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
self._kv_overrides_array[i].value.bool_value = v self._kv_overrides_array[i].value.val_bool = v
elif isinstance(v, int): elif isinstance(v, int):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
self._kv_overrides_array[i].value.int_value = v self._kv_overrides_array[i].value.val_i64 = v
elif isinstance(v, float): elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
self._kv_overrides_array[i].value.float_value = v self._kv_overrides_array[i].value.val_f64 = v
elif isinstance(v, str): # type: ignore elif isinstance(v, str): # type: ignore
v_bytes = v.encode("utf-8") v_bytes = v.encode("utf-8")
if len(v_bytes) > 128: # TODO: Make this a constant if len(v_bytes) > 128: # TODO: Make this a constant
@ -263,10 +264,12 @@ class Llama:
v_bytes = v_bytes.ljust(128, b"\0") v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
# copy min(v_bytes, 128) to str_value # copy min(v_bytes, 128) to str_value
address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
ctypes.memmove( ctypes.memmove(
self._kv_overrides_array[i].value.str_value, buffer_start,
v_bytes, v_bytes,
min(len(v_bytes), 128), 128,
) )
else: else:
raise ValueError(f"Unknown value type for {k}: {v}") raise ValueError(f"Unknown value type for {k}: {v}")
@ -562,12 +565,12 @@ class Llama:
if self.context_params.logits_all: if self.context_params.logits_all:
rows = n_tokens rows = n_tokens
cols = self._n_vocab cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols] logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
else: else:
rows = 1 rows = 1
cols = self._n_vocab cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols] logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
# Update n_tokens # Update n_tokens
self.n_tokens += n_tokens self.n_tokens += n_tokens

View file

@ -3098,7 +3098,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
"{% endif %}" "{% endif %}"
) )
class Llama3VisionAlpha(Llava15ChatHandler): class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
# question = "<image>" + q # question = "<image>" + q
# prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@ -3159,6 +3159,10 @@ class Llama3VisionAlpha(Llava15ChatHandler):
"{% endif %}" "{% endif %}"
) )
# alias
Llama3VisionAlpha = Llama3VisionAlphaChatHandler
@register_chat_completion_handler("chatml-function-calling") @register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling( def chatml_function_calling(
llama: llama.Llama, llama: llama.Llama,
@ -3193,7 +3197,6 @@ def chatml_function_calling(
llama_types.CreateChatCompletionResponse, llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse], Iterator[llama_types.CreateChatCompletionStreamResponse],
]: ]:
print(logprobs)
function_calling_template = ( function_calling_template = (
"{% for message in messages %}" "{% for message in messages %}"
"<|im_start|>{{ message.role }}\n" "<|im_start|>{{ message.role }}\n"

View file

@ -296,9 +296,11 @@ LLAMA_VOCAB_TYPE_WPM = 3
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# LLAMA_VOCAB_PRE_TYPE_REFACT = 8, # LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, # LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10, # LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
# LLAMA_VOCAB_PRE_TYPE_OLMO = 11, # LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 12, # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
# }; # };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@ -310,9 +312,11 @@ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
LLAMA_VOCAB_PRE_TYPE_REFACT = 8 LLAMA_VOCAB_PRE_TYPE_REFACT = 8
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10 LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
LLAMA_VOCAB_PRE_TYPE_OLMO = 11 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
LLAMA_VOCAB_PRE_TYPE_DBRX = 12 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
LLAMA_VOCAB_PRE_TYPE_DBRX = 13
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
# // note: these values should be synchronized with ggml_rope # // note: these values should be synchronized with ggml_rope
@ -609,17 +613,17 @@ LLAMA_KV_OVERRIDE_TYPE_STR = 3
# }; # };
class llama_model_kv_override_value(ctypes.Union): class llama_model_kv_override_value(ctypes.Union):
_fields_ = [ _fields_ = [
("int_value", ctypes.c_int64), ("val_i64", ctypes.c_int64),
("float_value", ctypes.c_double), ("val_f64", ctypes.c_double),
("bool_value", ctypes.c_bool), ("val_bool", ctypes.c_bool),
("str_value", ctypes.c_char * 128), ("val_str", ctypes.c_char * 128),
] ]
if TYPE_CHECKING: if TYPE_CHECKING:
int_value: int val_i64: int
float_value: float val_f64: float
bool_value: bool val_bool: bool
str_value: bytes val_str: bytes
class llama_model_kv_override(ctypes.Structure): class llama_model_kv_override(ctypes.Structure):
@ -716,6 +720,8 @@ class llama_model_params(ctypes.Structure):
] ]
# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
# // https://github.com/ggerganov/llama.cpp/pull/7544
# struct llama_context_params { # struct llama_context_params {
# uint32_t seed; // RNG seed, -1 for random # uint32_t seed; // RNG seed, -1 for random
# uint32_t n_ctx; // text context, 0 = from model # uint32_t n_ctx; // text context, 0 = from model
@ -742,15 +748,14 @@ class llama_model_params(ctypes.Structure):
# ggml_backend_sched_eval_callback cb_eval; # ggml_backend_sched_eval_callback cb_eval;
# void * cb_eval_user_data; # void * cb_eval_user_data;
# enum ggml_type type_k; // data type for K cache # enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
# enum ggml_type type_v; // data type for V cache # enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
# // Keep the booleans together to avoid misalignment during copy-by-value. # // Keep the booleans together to avoid misalignment during copy-by-value.
# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embeddings; // if true, extract embeddings (together with logits) # bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# bool flash_attn; // whether to use flash attention # bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
# // Abort callback # // Abort callback
# // if it returns true, execution of llama_decode() will be aborted # // if it returns true, execution of llama_decode() will be aborted
@ -2263,6 +2268,22 @@ def llama_set_n_threads(
... ...
# // Get the number of threads used for generation of a single token.
# LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
@ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_threads(ctx: llama_context_p, /) -> int:
"""Get the number of threads used for generation of a single token"""
...
# // Get the number of threads used for prompt and batch processing (multiple token).
# LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
@ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
"""Get the number of threads used for prompt and batch processing (multiple token)"""
...
# // Set whether to use causal attention or not # // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens # // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@ -2436,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
... ...
# // Identify if Token Id is a control token or a render-able token
# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
@ctypes_function(
"llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
)
def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
"""Identify if Token Id is a control token or a render-able token"""
...
# // Special tokens # // Special tokens

View file

@ -183,7 +183,7 @@ class LlamaProxy:
num_pred_tokens=settings.draft_model_num_pred_tokens num_pred_tokens=settings.draft_model_num_pred_tokens
) )
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
if settings.kv_overrides is not None: if settings.kv_overrides is not None:
assert isinstance(settings.kv_overrides, list) assert isinstance(settings.kv_overrides, list)
kv_overrides = {} kv_overrides = {}
@ -197,6 +197,8 @@ class LlamaProxy:
kv_overrides[key] = int(value) kv_overrides[key] = int(value)
elif value_type == "float": elif value_type == "float":
kv_overrides[key] = float(value) kv_overrides[key] = float(value)
elif value_type == "str":
kv_overrides[key] = value
else: else:
raise ValueError(f"Unknown value type {value_type}") raise ValueError(f"Unknown value type {value_type}")

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 05834841dcb4f922983ea976539c70472272df9a Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d