From f4491c4903f791b74de9e98e9dc03b87fcc9dfbb Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 17 Jun 2024 11:56:03 -0400
Subject: [PATCH 01/10] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 2 ++
 vendor/llama.cpp       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 6f37fcb..6d24324 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -301,6 +301,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
 # LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+# LLAMA_VOCAB_PRE_TYPE_PORO = 15,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -317,6 +318,7 @@ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
+LLAMA_VOCAB_PRE_TYPE_PORO = 15
 
 
 # // note: these values should be synchronized with ggml_rope
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 172c825..b473e95 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15
+Subproject commit b473e95084c286780165568cf0f385f21141d68d

From 4c1d74c0aecb5fe263674cc761573654b16d4c20 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 19 Jun 2024 10:07:20 -0400
Subject: [PATCH 02/10] fix: Make destructor automatically call .close()
 method on Llama class.

---
 llama_cpp/_internals.py | 9 +++++++++
 llama_cpp/llama.py      | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index ee990d4..423c1b5 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -64,6 +64,9 @@ class _LlamaModel:
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def vocab_type(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_vocab_type(self.model)
@@ -292,6 +295,9 @@ class _LlamaContext:
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def n_ctx(self) -> int:
         assert self.ctx is not None
         return llama_cpp.llama_n_ctx(self.ctx)
@@ -531,6 +537,9 @@ class _LlamaBatch:
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def n_tokens(self) -> int:
         assert self.batch is not None
         return self.batch.n_tokens
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 459b29f..5f53966 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1968,6 +1968,9 @@ class Llama:
         """Explicitly free the model from memory."""
         self._stack.close()
 
+    def __del__(self) -> None:
+        self.close()
+
     @staticmethod
     def logits_to_logprobs(
         logits: Union[npt.NDArray[np.single], List], axis: int = -1

From 554fd08e7d90f1c49cc513febdc79cc723ceb512 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 19 Jun 2024 10:07:28 -0400
Subject: [PATCH 03/10] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index b473e95..9c77ec1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit b473e95084c286780165568cf0f385f21141d68d
+Subproject commit 9c77ec1d74874ee22bdef8f110e8e8d41389abf2

From 6c331909ca365cb5aa490288a02853f1bb1d6979 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 19 Jun 2024 10:10:01 -0400
Subject: [PATCH 04/10] chore: Bump version

---
 CHANGELOG.md          | 7 +++++++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b19985..42b14c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.79]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2
+- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416
+- feat: Add .close() method to Llama class to explicitly free model from memory by @jkawamoto in #1513
+- feat: Support SPM infill by @CISC in #1492
+
 ## [0.2.78]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 5dd2b43..71371b3 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.78"
\ No newline at end of file
+__version__ = "0.2.79"
\ No newline at end of file

From d98a24a25b49b02007071dc3e4bc2cc9a0212935 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 20 Jun 2024 10:50:40 -0400
Subject: [PATCH 05/10] docs: Remove references to deprecated opencl backend.
 Closes #1512

---
 Makefile  |  3 ---
 README.md | 11 -----------
 2 files changed, 14 deletions(-)

diff --git a/Makefile b/Makefile
index d8fb0cc..90f562f 100644
--- a/Makefile
+++ b/Makefile
@@ -24,9 +24,6 @@ build.debug:
 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
 
-build.opencl:
-	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
-
 build.openblas:
 	CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
 
diff --git a/README.md b/README.md
index 0f7abfb..4c37ba3 100644
--- a/README.md
+++ b/README.md
@@ -165,17 +165,6 @@ pip install llama-cpp-python \
   --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
 ```
 </details>
-
-<details>
-<summary>CLBlast (OpenCL)</summary>
-
-To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
-
-```bash
-CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
-```
-
-</details>
 
 <details>
 <summary>hipBLAS (ROCm)</summary>
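PATCH 02 above (shipped as 0.2.79 in PATCH 04) makes cleanup symmetric: `.close()` frees the native llama.cpp model and context explicitly, and the new `__del__` destructors fall back to the same path when an object is garbage-collected. A minimal usage sketch against the public API — the model path is a placeholder, not something taken from the patches:

```python
from llama_cpp import Llama

# Placeholder GGUF path; substitute any local model file.
llm = Llama(model_path="./models/model-q8_0.gguf", verbose=False)
try:
    out = llm("Q: Name the planets in the solar system? A: ", max_tokens=32)
    print(out["choices"][0]["text"])
finally:
    llm.close()  # deterministic release of the model/context memory

# After PATCH 02, dropping the last reference is also safe: __del__ calls
# close(), so native memory is reclaimed on garbage collection even when
# close() was never invoked explicitly.
```
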
From 5beec1a1fd28acf6e32d45a6b990ef6389d288ed Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 21 Jun 2024 12:09:14 -0400
Subject: [PATCH 06/10] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 9c77ec1..557b653 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 9c77ec1d74874ee22bdef8f110e8e8d41389abf2
+Subproject commit 557b653dc9ed91e8c313e87500e0050c775f81b6

From 27d53589ff884559919b4bd2c40690223feaaffa Mon Sep 17 00:00:00 2001
From: Jon Craton
Date: Fri, 21 Jun 2024 12:10:15 -0400
Subject: [PATCH 07/10] docs: Update readme examples to use newer Qwen2 model
 (#1544)

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4c37ba3..4a710d4 100644
--- a/README.md
+++ b/README.md
@@ -327,7 +327,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i
 
 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
+    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
     filename="*q8_0.gguf",
     verbose=False
 )
@@ -688,7 +688,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
 
 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
 ```
 
 ### Web Server Features

From 398fe81547d2b36a57730d19e66fb69184054958 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Jun 2024 12:10:34 -0400
Subject: [PATCH 08/10] chore(deps): bump docker/build-push-action from 5 to 6
 (#1539)

Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 5 to 6.
- [Release notes](https://github.com/docker/build-push-action/releases)
- [Commits](https://github.com/docker/build-push-action/compare/v5...v6)

---
updated-dependencies:
- dependency-name: docker/build-push-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build-docker.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
index 4ebe3bb..b5c7346 100644
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -31,7 +31,7 @@ jobs:
 
     - name: Build and push
       id: docker_build
-      uses: docker/build-push-action@v5
+      uses: docker/build-push-action@v6
       with:
         context: .
         file: "docker/simple/Dockerfile"

From 35c980eb2ece665a8d3b636968358bd82f3f27bd Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Jun 2024 12:10:43 -0400
Subject: [PATCH 09/10] chore(deps): bump pypa/cibuildwheel from 2.18.1 to
 2.19.1 (#1527)

Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.18.1 to 2.19.1.
- [Release notes](https://github.com/pypa/cibuildwheel/releases)
- [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md)
- [Commits](https://github.com/pypa/cibuildwheel/compare/v2.18.1...v2.19.1)

---
updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build-and-release.yaml  | 4 ++--
 .github/workflows/build-wheels-metal.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index ebf52b3..34e9dc1 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -29,7 +29,7 @@ jobs:
         python -m pip install -e .[all]
 
     - name: Build wheels
-      uses: pypa/cibuildwheel@v2.19.0
+      uses: pypa/cibuildwheel@v2.19.1
      env:
        # disable repair
        CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
         platforms: linux/arm64
 
    - name: Build wheels
-      uses: pypa/cibuildwheel@v2.19.0
+      uses: pypa/cibuildwheel@v2.19.1
      env:
        CIBW_SKIP: "*musllinux* pp*"
        CIBW_REPAIR_WHEEL_COMMAND: ""
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index f007eb3..11ab795 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -30,7 +30,7 @@ jobs:
         python -m pip install -e .[all]
 
    - name: Build wheels
-      uses: pypa/cibuildwheel@v2.18.1
+      uses: pypa/cibuildwheel@v2.19.1
      env:
        # disable repair
        CIBW_REPAIR_WHEEL_COMMAND: ""

From 04959f1884c8ef93bd5a4aa40ff0accb8438c0c1 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 21 Jun 2024 16:56:15 -0400
Subject: [PATCH 10/10] feat: Update llama_cpp.py bindings

---
 llama_cpp/llama_cpp.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 6d24324..1116b1f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -468,11 +468,13 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 # LLAMA_POOLING_TYPE_NONE = 0,
 # LLAMA_POOLING_TYPE_MEAN = 1,
 # LLAMA_POOLING_TYPE_CLS = 2,
+# LLAMA_POOLING_TYPE_LAST = 3,
 # };
 LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
+LLAMA_POOLING_TYPE_LAST = 3
 
 # enum llama_split_mode {
 # LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -761,7 +763,6 @@ class llama_model_params(ctypes.Structure):
 
 # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-# // (ignored if no pooling layer)
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2316,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
     ...
 
 
+# // Set whether the model is in embeddings mode or not
+# // If true, embeddings will be returned but logits will not
+# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+    """Set whether the model is in embeddings mode or not
+    If true, embeddings will be returned but logits will not"""
+    ...
+
+
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
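PATCH 10 surfaces two additions from upstream llama.cpp: the `LLAMA_POOLING_TYPE_LAST` constant and the `llama_set_embeddings` toggle, which switches an existing context between producing embeddings and producing logits. A sketch of how the new binding could be driven from the low-level ctypes layer — the model path is a placeholder, and the decode step is elided:

```python
import llama_cpp

# Placeholder GGUF path; substitute any local model file.
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/model-q8_0.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_LAST  # new constant from this patch
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

llama_cpp.llama_set_embeddings(ctx, True)   # decode now yields embeddings, not logits
# ... tokenize and llama_decode() a batch here, then read the embeddings ...
llama_cpp.llama_set_embeddings(ctx, False)  # switch back for normal generation

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```

The point of the runtime toggle is that a single context can serve both embedding and generation passes without being rebuilt; the high-level `Llama(embedding=True)` constructor flag remains the simpler path when only embeddings are needed.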