Compare commits


No commits in common. "fc2af04c15a209b105715e591bceb4d2990afcd4" and "5b4ad6f4d17180c40f62bb88a6c998446f9ac79c" have entirely different histories.

10 changed files with 39 additions and 93 deletions

View file

@@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
-uses: pypa/cibuildwheel@v2.18.1
+uses: pypa/cibuildwheel@v2.18.0
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64
- name: Build wheels
-uses: pypa/cibuildwheel@v2.18.1
+uses: pypa/cibuildwheel@v2.18.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""

View file

@@ -7,12 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
-## [0.2.76]
-- feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
-- feat: Improve Llama.eval performance by avoiding list conversion by @thoughtp0lice in #1476
-- example: LLM inference with Ray Serve by @rgerganov in #1465
## [0.2.75]
- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305

View file

@@ -13,13 +13,7 @@ build:
python3 -m pip install --verbose -e .
build.debug:
-python3 -m pip install \
-  --verbose \
-  --config-settings=cmake.verbose=true \
-  --config-settings=logging.level=INFO \
-  --config-settings=install.strip=false \
-  --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
-  --editable .
+CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
build.cuda:
CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

View file

@@ -499,16 +499,13 @@ llm = Llama.from_pretrained(
`llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images.
-Below are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API).
+You'll first need to download one of the available multi-modal models in GGUF format:
-| Model | `LlamaChatHandler` | `chat_format` |
-|:--- |:--- |:--- |
-| [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |
-| [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |
-| [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |
-| [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` |
-| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
-| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
+- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
+- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
+- [moondream2](https://huggingface.co/vikhyatk/moondream2)
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
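For context, a minimal usage sketch of the chat-handler API that both sides of this hunk document (the model path, mmproj path, and image URL below are placeholders, not values from this diff):

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths: point these at a downloaded llava-v1.5 GGUF model and its CLIP/mmproj file.
chat_handler = Llava15ChatHandler(clip_model_path="path/to/mmproj-model-f16.gguf")
llm = Llama(
    model_path="path/to/llava-v1.5-7b.Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,  # extra context so the image embeddings fit alongside the text
)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant that describes images."},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image in detail."},
            ],
        },
    ]
)
print(response["choices"][0]["message"]["content"])
```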

View file

@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
-__version__ = "0.2.76"
+__version__ = "0.2.75"

View file

@@ -6,7 +6,6 @@ import uuid
import time
import json
import ctypes
-import typing
import fnmatch
import multiprocessing
@@ -250,13 +249,13 @@ class Llama:
self._kv_overrides_array[i].key = k.encode("utf-8")
if isinstance(v, bool):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
-self._kv_overrides_array[i].value.val_bool = v
+self._kv_overrides_array[i].value.bool_value = v
elif isinstance(v, int):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
-self._kv_overrides_array[i].value.val_i64 = v
+self._kv_overrides_array[i].value.int_value = v
elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
-self._kv_overrides_array[i].value.val_f64 = v
+self._kv_overrides_array[i].value.float_value = v
elif isinstance(v, str): # type: ignore
v_bytes = v.encode("utf-8")
if len(v_bytes) > 128: # TODO: Make this a constant
@@ -264,12 +263,10 @@ class Llama:
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
# copy min(v_bytes, 128) to str_value
-address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
-buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
ctypes.memmove(
-buffer_start,
+self._kv_overrides_array[i].value.str_value,
v_bytes,
-128,
+min(len(v_bytes), 128),
)
else:
raise ValueError(f"Unknown value type for {k}: {v}")
@@ -565,12 +562,12 @@ class Llama:
if self.context_params.logits_all:
rows = n_tokens
cols = self._n_vocab
-logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
+logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
else:
rows = 1
cols = self._n_vocab
-logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
+logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
# Update n_tokens
self.n_tokens += n_tokens
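A small self-contained sketch (not from either commit) of the two access patterns in this hunk, with a dummy buffer standing in for the real logits pointer: slicing a ctypes pointer (the "+" side) materializes a Python list element by element, while `np.ctypeslib.as_array` (the "-" side, reverted by this compare) wraps the same memory as a float32 ndarray without copying:

```python
import ctypes
import numpy as np

# Stand-in for the C logits buffer that _ctx.get_logits() points at.
rows, cols = 2, 3
buf = (ctypes.c_float * (rows * cols))(*[0.1 * i for i in range(rows * cols)])
ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))

list_copy = ptr[: rows * cols]                                 # per-element copy into a Python list
array_view = np.ctypeslib.as_array(ptr, shape=(rows * cols,))  # zero-copy float32 view of the buffer

assert list_copy == list(array_view)
```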

View file

@@ -3098,7 +3098,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
"{% endif %}"
)
-class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
+class Llama3VisionAlpha(Llava15ChatHandler):
# question = "<image>" + q
# prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@@ -3159,10 +3159,6 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
"{% endif %}"
)
-# alias
-Llama3VisionAlpha = Llama3VisionAlphaChatHandler
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
llama: llama.Llama,
@@ -3197,6 +3193,7 @@ def chatml_function_calling(
llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
+print(logprobs)
function_calling_template = (
"{% for message in messages %}"
"<|im_start|>{{ message.role }}\n"

View file

@@ -296,11 +296,9 @@ LLAMA_VOCAB_TYPE_WPM = 3
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-# LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
-# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
-# LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
-# LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
-# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -312,11 +310,9 @@ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
LLAMA_VOCAB_PRE_TYPE_REFACT = 8
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
-LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
-LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
-LLAMA_VOCAB_PRE_TYPE_OLMO = 12
-LLAMA_VOCAB_PRE_TYPE_DBRX = 13
-LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
+LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
+LLAMA_VOCAB_PRE_TYPE_OLMO = 11
+LLAMA_VOCAB_PRE_TYPE_DBRX = 12
# // note: these values should be synchronized with ggml_rope
@@ -613,17 +609,17 @@ LLAMA_KV_OVERRIDE_TYPE_STR = 3
# };
class llama_model_kv_override_value(ctypes.Union):
_fields_ = [
-("val_i64", ctypes.c_int64),
-("val_f64", ctypes.c_double),
-("val_bool", ctypes.c_bool),
-("val_str", ctypes.c_char * 128),
+("int_value", ctypes.c_int64),
+("float_value", ctypes.c_double),
+("bool_value", ctypes.c_bool),
+("str_value", ctypes.c_char * 128),
]
if TYPE_CHECKING:
-val_i64: int
-val_f64: float
-val_bool: bool
-val_str: bytes
+int_value: int
+float_value: float
+bool_value: bool
+str_value: bytes
class llama_model_kv_override(ctypes.Structure):
@@ -720,8 +716,6 @@ class llama_model_params(ctypes.Structure):
]
-# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-# // https://github.com/ggerganov/llama.cpp/pull/7544
# struct llama_context_params {
# uint32_t seed; // RNG seed, -1 for random
# uint32_t n_ctx; // text context, 0 = from model
@@ -748,14 +742,15 @@ class llama_model_params(ctypes.Structure):
# ggml_backend_sched_eval_callback cb_eval;
# void * cb_eval_user_data;
-# enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
-# enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
+# enum ggml_type type_k; // data type for K cache
+# enum ggml_type type_v; // data type for V cache
# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-# bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+# bool flash_attn; // whether to use flash attention
# // Abort callback
# // if it returns true, execution of llama_decode() will be aborted
@@ -2268,22 +2263,6 @@ def llama_set_n_threads(
    ...
-# // Get the number of threads used for generation of a single token.
-# LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
-@ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_threads(ctx: llama_context_p, /) -> int:
-    """Get the number of threads used for generation of a single token"""
-    ...
-# // Get the number of threads used for prompt and batch processing (multiple token).
-# LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
-@ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
-    """Get the number of threads used for prompt and batch processing (multiple token)"""
-    ...
# // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -2457,16 +2436,6 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
    ...
-# // Identify if Token Id is a control token or a render-able token
-# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
-@ctypes_function(
-    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
-)
-def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
-    """Identify if Token Id is a control token or a render-able token"""
-    ...
# // Special tokens
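For orientation, a sketch of what bindings like the removed `llama_n_threads` pair boil down to at the raw ctypes level. The helper below is illustrative only and assumes you already hold a loaded llama shared-library handle; it is not the library's own `@ctypes_function` machinery:

```python
import ctypes

def declare_thread_getters(lib: ctypes.CDLL) -> None:
    # LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
    lib.llama_n_threads.argtypes = [ctypes.c_void_p]
    lib.llama_n_threads.restype = ctypes.c_uint32
    # LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
    lib.llama_n_threads_batch.argtypes = [ctypes.c_void_p]
    lib.llama_n_threads_batch.restype = ctypes.c_uint32
```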

View file

@@ -183,7 +183,7 @@ class LlamaProxy:
num_pred_tokens=settings.draft_model_num_pred_tokens
)
-kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
+kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
if settings.kv_overrides is not None:
assert isinstance(settings.kv_overrides, list)
kv_overrides = {}
@@ -197,8 +197,6 @@ class LlamaProxy:
kv_overrides[key] = int(value)
elif value_type == "float":
kv_overrides[key] = float(value)
-elif value_type == "str":
-    kv_overrides[key] = value
else:
raise ValueError(f"Unknown value type {value_type}")

vendor/llama.cpp vendored

@@ -1 +1 @@
-Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d
+Subproject commit 05834841dcb4f922983ea976539c70472272df9a