11 changed files with 23 additions and 41 deletions
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -29,7 +29,7 @@ jobs:
          python -m pip install -e .[all]

      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.1
+        uses: pypa/cibuildwheel@v2.19.0
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
          platforms: linux/arm64

      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.1
+        uses: pypa/cibuildwheel@v2.19.0
        env:
          CIBW_SKIP: "*musllinux* pp*"
          CIBW_REPAIR_WHEEL_COMMAND: ""
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -31,7 +31,7 @@ jobs:

      - name: Build and push
        id: docker_build
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          context: .
          file: "docker/simple/Dockerfile"
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -30,7 +30,7 @@ jobs:
          python -m pip install -e .[all]

      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.1
+        uses: pypa/cibuildwheel@v2.18.1
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,13 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

-## [0.2.79]
-
- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2
- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416
- feat: Add .close() method to Llama class to explicitly free model from memory by @jkawamoto in #1513
- feat: Support SPM infill by @CISC in #1492
-
 ## [0.2.78]

 - feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,9 @@ build.debug:
 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

+build.opencl:
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
+
 build.openblas:
 	CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .

--- a/README.md
+++ b/README.md
@@ -165,6 +165,17 @@ pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
 ```

+</details>
+<details>
+
+<summary>CLBlast (OpenCL)</summary>
+
+To install with CLBlast, set the `CMAKE_ARGS` environment variable to `-DLLAMA_CLBLAST=on` before installing:
+
+```bash
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
+```
+
 </details>

 <details>
@@ -327,7 +338,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i

 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
+    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    verbose=False
 )
@@ -688,7 +699,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.

 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
 ```

 ### Web Server Features
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.79"
+__version__ = "0.2.78"
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -64,9 +64,6 @@ class _LlamaModel:
    def close(self):
        self._exit_stack.close()

-    def __del__(self):
-        self.close()
-
    def vocab_type(self) -> int:
        assert self.model is not None
        return llama_cpp.llama_vocab_type(self.model)
@@ -295,9 +292,6 @@ class _LlamaContext:
    def close(self):
        self._exit_stack.close()

-    def __del__(self):
-        self.close()
-
    def n_ctx(self) -> int:
        assert self.ctx is not None
        return llama_cpp.llama_n_ctx(self.ctx)
@@ -537,9 +531,6 @@ class _LlamaBatch:
    def close(self):
        self._exit_stack.close()

-    def __del__(self):
-        self.close()
-
    def n_tokens(self) -> int:
        assert self.batch is not None
        return self.batch.n_tokens
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1968,9 +1968,6 @@ class Llama:
        """Explicitly free the model from memory."""
        self._stack.close()

-    def __del__(self) -> None:
-        self.close()
-
    @staticmethod
    def logits_to_logprobs(
        logits: Union[npt.NDArray[np.single], List], axis: int = -1
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -301,7 +301,6 @@ LLAMA_VOCAB_TYPE_WPM = 3
 #     LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
 #     LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
 #     LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-#     LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -318,7 +317,6 @@ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
-LLAMA_VOCAB_PRE_TYPE_PORO = 15


 # // note: these values should be synchronized with ggml_rope
@@ -468,13 +466,11 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 #     LLAMA_POOLING_TYPE_NONE = 0,
 #     LLAMA_POOLING_TYPE_MEAN = 1,
 #     LLAMA_POOLING_TYPE_CLS  = 2,
-#     LLAMA_POOLING_TYPE_LAST = 3,
 # };
 LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
-LLAMA_POOLING_TYPE_LAST = 3   

 # enum llama_split_mode {
 #     LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
@@ -763,6 +759,7 @@ class llama_model_params(ctypes.Structure):

 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+#                                                     // (ignored if no pooling layer)

 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 #     float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -2317,16 +2314,6 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
    ...


-# // Set whether the model is in embeddings model or not
-# // If true, embeddings will be returned but logits will not
-# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
-@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
-def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
-    """Set whether the model is in embeddings model or not
-    If true, embeddings will be returned but logits will not"""
-    ...
-
-
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 557b653dc9ed91e8c313e87500e0050c775f81b6
+Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15