diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index ebf52b3..34e9dc1 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
- uses: pypa/cibuildwheel@v2.19.0
+ uses: pypa/cibuildwheel@v2.19.1
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64
- name: Build wheels
- uses: pypa/cibuildwheel@v2.19.0
+ uses: pypa/cibuildwheel@v2.19.1
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
index 4ebe3bb..b5c7346 100644
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -31,7 +31,7 @@ jobs:
- name: Build and push
id: docker_build
- uses: docker/build-push-action@v5
+ uses: docker/build-push-action@v6
with:
context: .
file: "docker/simple/Dockerfile"
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index f007eb3..11ab795 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -30,7 +30,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
- uses: pypa/cibuildwheel@v2.18.1
+ uses: pypa/cibuildwheel@v2.19.1
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b19985..42b14c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+## [0.2.79]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2
+- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416
+- feat: Add .close() method to Llama class to explicitly free the model from memory by @jkawamoto in #1513
+- feat: Support SPM infill by @CISC in #1492
+
## [0.2.78]
- feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7
diff --git a/Makefile b/Makefile
index d8fb0cc..90f562f 100644
--- a/Makefile
+++ b/Makefile
@@ -24,9 +24,6 @@ build.debug:
build.cuda:
CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
-build.opencl:
- CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
-
build.openblas:
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
diff --git a/README.md b/README.md
index 0f7abfb..4a710d4 100644
--- a/README.md
+++ b/README.md
@@ -165,17 +165,6 @@ pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
```
-</details>
-
-<details>
-<summary>CLBlast (OpenCL)</summary>
-
-To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
-
-```bash
-CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
-```
-
@@ -338,7 +327,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i
```python
llm = Llama.from_pretrained(
- repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
+ repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
filename="*q8_0.gguf",
verbose=False
)
@@ -699,7 +688,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
```
### Web Server Features
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 5dd2b43..71371b3 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
-__version__ = "0.2.78"
\ No newline at end of file
+__version__ = "0.2.79"
\ No newline at end of file
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index ee990d4..423c1b5 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -64,6 +64,9 @@ class _LlamaModel:
def close(self):
self._exit_stack.close()
+ def __del__(self):
+ self.close()
+
def vocab_type(self) -> int:
assert self.model is not None
return llama_cpp.llama_vocab_type(self.model)
@@ -292,6 +295,9 @@ class _LlamaContext:
def close(self):
self._exit_stack.close()
+ def __del__(self):
+ self.close()
+
def n_ctx(self) -> int:
assert self.ctx is not None
return llama_cpp.llama_n_ctx(self.ctx)
@@ -531,6 +537,9 @@ class _LlamaBatch:
def close(self):
self._exit_stack.close()
+ def __del__(self):
+ self.close()
+
def n_tokens(self) -> int:
assert self.batch is not None
return self.batch.n_tokens
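
For context on the `__del__` additions above: each internal wrapper registers its native free routine on a `contextlib.ExitStack`, and `ExitStack.close()` is idempotent, so routing both explicit `close()` calls and garbage-collection-time `__del__` through the same stack releases the underlying llama.cpp handle at most once. A minimal sketch of that pattern, using purely illustrative names (nothing below is taken verbatim from the patch):

```python
# Illustrative sketch of the ExitStack-based cleanup pattern (hypothetical names,
# not code from this patch).
import contextlib


class ManagedHandle:
    def __init__(self) -> None:
        self._exit_stack = contextlib.ExitStack()
        self.handle = object()  # stand-in for a native llama.cpp pointer
        # Register the free routine; ExitStack runs it exactly once, on close().
        self._exit_stack.callback(self._free)

    def _free(self) -> None:
        print("freeing native handle")
        self.handle = None

    def close(self) -> None:
        # Safe to call repeatedly: a closed ExitStack has no callbacks left to run.
        self._exit_stack.close()

    def __del__(self) -> None:
        # Fallback for objects that were never closed explicitly.
        self.close()


h = ManagedHandle()
h.close()  # frees the handle immediately
del h      # __del__ still runs, but the stack is already empty, so no double free
```
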
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 459b29f..5f53966 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1968,6 +1968,9 @@ class Llama:
"""Explicitly free the model from memory."""
self._stack.close()
+ def __del__(self) -> None:
+ self.close()
+
@staticmethod
def logits_to_logprobs(
logits: Union[npt.NDArray[np.single], List], axis: int = -1
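
With `Llama.close()` and the `__del__` fallback in place, resources can be released deterministically instead of waiting on garbage collection. A small usage sketch (the model path below is a placeholder, not from this patch):

```python
# Usage sketch for the new explicit-cleanup path; the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="./models/example-q8_0.gguf", verbose=False)
try:
    out = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
    print(out["choices"][0]["text"])
finally:
    # Free the model and context deterministically; __del__ now also calls
    # close() as a safety net if this is skipped.
    llm.close()
```
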
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 6f37fcb..1116b1f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -301,6 +301,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
# LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+# LLAMA_VOCAB_PRE_TYPE_PORO = 15,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -317,6 +318,7 @@ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
LLAMA_VOCAB_PRE_TYPE_OLMO = 12
LLAMA_VOCAB_PRE_TYPE_DBRX = 13
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
+LLAMA_VOCAB_PRE_TYPE_PORO = 15
# // note: these values should be synchronized with ggml_rope
@@ -466,11 +468,13 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
# LLAMA_POOLING_TYPE_NONE = 0,
# LLAMA_POOLING_TYPE_MEAN = 1,
# LLAMA_POOLING_TYPE_CLS = 2,
+# LLAMA_POOLING_TYPE_LAST = 3,
# };
LLAMA_POOLING_TYPE_UNSPECIFIED = -1
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2
+LLAMA_POOLING_TYPE_LAST = 3
# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -759,7 +763,6 @@ class llama_model_params(ctypes.Structure):
# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-# // (ignored if no pooling layer)
# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
# float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2314,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
...
+# // Set whether the model is in embeddings mode or not
+# // If true, embeddings will be returned but logits will not
+# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+    """Set whether the model is in embeddings mode or not
+ If true, embeddings will be returned but logits will not"""
+ ...
+
+
# // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
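
A minimal sketch of how the new `llama_set_embeddings` binding might be called, assuming a valid low-level `llama_context_p` (`ctx`) already exists; model and context setup are omitted:

```python
# Hedged sketch: toggling embeddings mode on a low-level context handle.
# Assumes `ctx` is a valid llama_context_p created elsewhere (for example via
# llama_cpp.llama_new_context_with_model); model/context creation is not shown.
import llama_cpp


def set_embeddings_mode(ctx: llama_cpp.llama_context_p, enabled: bool) -> None:
    # When enabled, subsequent decodes return embeddings and skip logits.
    llama_cpp.llama_set_embeddings(ctx, enabled)
```

For most applications, constructing the high-level `Llama(..., embedding=True)` remains the simpler route; the new binding exposes a per-context toggle for callers working against the C API directly.
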
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 172c825..557b653 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15
+Subproject commit 557b653dc9ed91e8c313e87500e0050c775f81b6