Compare commits
11 commits: 5f5ea0a49c ... e9b337b312
Commits (SHA1):

- e9b337b312
- 04959f1884
- 35c980eb2e
- 398fe81547
- 27d53589ff
- 5beec1a1fd
- d98a24a25b
- 6c331909ca
- 554fd08e7d
- 4c1d74c0ae
- f4491c4903
11 changed files with 41 additions and 23 deletions
.github/workflows/build-and-release.yaml (4 changes, vendored)

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.0
+        uses: pypa/cibuildwheel@v2.19.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""

@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.0
+        uses: pypa/cibuildwheel@v2.19.1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""
.github/workflows/build-docker.yaml (2 changes, vendored)

@@ -31,7 +31,7 @@ jobs:

       - name: Build and push
         id: docker_build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           file: "docker/simple/Dockerfile"
.github/workflows/build-wheels-metal.yaml (2 changes, vendored)

@@ -30,7 +30,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.1
+        uses: pypa/cibuildwheel@v2.19.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.79]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2
+- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416
+- feat: Add .close() method to Llama class to explicitly free model from memory by @jkawamoto in #1513
+- feat: Support SPM infill by @CISC in #1492
+
 ## [0.2.78]

 - feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7
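The `.close()` entry above (#1513) adds explicit cleanup to the high-level API. A minimal sketch of how it can be used, assuming a placeholder model path and prompt that are not part of this changeset:

```python
from llama_cpp import Llama

# Placeholder model path; any local GGUF file works here.
llm = Llama(model_path="./models/example-q8_0.gguf", verbose=False)
try:
    out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
    print(out["choices"][0]["text"])
finally:
    # New in 0.2.79: explicitly free the model from memory instead of
    # waiting for garbage collection.
    llm.close()
```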
Makefile (3 changes)

@@ -24,9 +24,6 @@ build.debug:
 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

-build.opencl:
-	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
-
 build.openblas:
 	CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .

README.md (15 changes)

@@ -165,17 +165,6 @@ pip install llama-cpp-python \
   --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
 ```

-</details>
-<details>
-
-<summary>CLBlast (OpenCL)</summary>
-
-To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
-
-```bash
-CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
-```
-
 </details>

 <details>
@@ -338,7 +327,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i

 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
+    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
     filename="*q8_0.gguf",
     verbose=False
 )
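For context, a hedged sketch of running a chat completion against the updated repo id; the prompt and `max_tokens` value are illustrative and not part of the README:

```python
from llama_cpp import Llama

# Downloads the matching GGUF file from the Hugging Face Hub on first use
# (requires the huggingface-hub package mentioned in the hunk header).
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False,
)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=32,  # illustrative limit
)
print(response["choices"][0]["message"]["content"])
```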
@@ -699,7 +688,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.

 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
 ```

 ### Web Server Features
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.78"
+__version__ = "0.2.79"
@@ -64,6 +64,9 @@ class _LlamaModel:
     def close(self):
         self._exit_stack.close()

+    def __del__(self):
+        self.close()
+
     def vocab_type(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_vocab_type(self.model)
@@ -292,6 +295,9 @@ class _LlamaContext:
     def close(self):
         self._exit_stack.close()

+    def __del__(self):
+        self.close()
+
     def n_ctx(self) -> int:
         assert self.ctx is not None
         return llama_cpp.llama_n_ctx(self.ctx)
@@ -531,6 +537,9 @@ class _LlamaBatch:
     def close(self):
         self._exit_stack.close()

+    def __del__(self):
+        self.close()
+
     def n_tokens(self) -> int:
         assert self.batch is not None
         return self.batch.n_tokens
@@ -1968,6 +1968,9 @@ class Llama:
         """Explicitly free the model from memory."""
         self._stack.close()

+    def __del__(self) -> None:
+        self.close()
+
     @staticmethod
     def logits_to_logprobs(
         logits: Union[npt.NDArray[np.single], List], axis: int = -1
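The four hunks above apply the same resource-management pattern: cleanup callbacks are registered on an `ExitStack`, `close()` unwinds the stack, and the newly added `__del__` forwards to `close()` so native handles are also released when an object is garbage collected. A stripped-down sketch of that pattern (class and callback names here are illustrative, not taken from the diff):

```python
import contextlib


class ManagedHandle:
    """Illustrative stand-in for the wrappers patched above."""

    def __init__(self):
        self._exit_stack = contextlib.ExitStack()
        self._handle = object()  # stand-in for a native llama.cpp pointer
        # Register the cleanup so close() (or __del__) frees it exactly once;
        # ExitStack.close() is a no-op on a stack that was already unwound.
        self._exit_stack.callback(self._free)

    def _free(self):
        self._handle = None

    def close(self):
        self._exit_stack.close()

    def __del__(self):
        self.close()
```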
@@ -301,6 +301,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
 # LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+# LLAMA_VOCAB_PRE_TYPE_PORO = 15,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -317,6 +318,7 @@ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
+LLAMA_VOCAB_PRE_TYPE_PORO = 15


 # // note: these values should be synchronized with ggml_rope
@@ -466,11 +468,13 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 # LLAMA_POOLING_TYPE_NONE = 0,
 # LLAMA_POOLING_TYPE_MEAN = 1,
 # LLAMA_POOLING_TYPE_CLS = 2,
+# LLAMA_POOLING_TYPE_LAST = 3,
 # };
 LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
+LLAMA_POOLING_TYPE_LAST = 3

 # enum llama_split_mode {
 # LLAMA_SPLIT_MODE_NONE = 0, // single GPU
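`LLAMA_POOLING_TYPE_LAST` mirrors the last-token pooling mode added upstream. A hedged sketch of selecting it through the high-level constructor, assuming the `embedding` and `pooling_type` parameters behave as in earlier releases and using a placeholder model path:

```python
import llama_cpp
from llama_cpp import Llama

llm = Llama(
    model_path="./models/embedding-model-q8_0.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_LAST,  # newly exposed constant
    verbose=False,
)

# With a pooling type set, embed() returns one pooled vector per input string.
vector = llm.embed("The quick brown fox jumps over the lazy dog")
print(len(vector))
```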
@@ -759,7 +763,6 @@ class llama_model_params(ctypes.Structure):

 # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-# // (ignored if no pooling layer)

 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2314,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
     ...


+# // Set whether the model is in embeddings model or not
+# // If true, embeddings will be returned but logits will not
+# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+    """Set whether the model is in embeddings model or not
+    If true, embeddings will be returned but logits will not"""
+    ...
+
+
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
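The new `llama_set_embeddings` binding lets a low-level context be switched between embedding and logit output without being recreated. A hedged sketch of calling it, assuming the surrounding low-level calls (`llama_backend_init`, `llama_load_model_from_file`, `llama_new_context_with_model`, `llama_free`, `llama_free_model`) work as in the existing bindings and using a placeholder model path:

```python
import llama_cpp

llama_cpp.llama_backend_init()
model = llama_cpp.llama_load_model_from_file(
    b"./models/example-q8_0.gguf",  # placeholder path (bytes, per the ctypes bindings)
    llama_cpp.llama_model_default_params(),
)
ctx = llama_cpp.llama_new_context_with_model(
    model, llama_cpp.llama_context_default_params()
)

# New in this range: return embeddings instead of logits from subsequent decodes...
llama_cpp.llama_set_embeddings(ctx, True)
# ...and switch back again without rebuilding the context.
llama_cpp.llama_set_embeddings(ctx, False)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```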
vendor/llama.cpp (2 changes, vendored)

@@ -1 +1 @@
-Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15
+Subproject commit 557b653dc9ed91e8c313e87500e0050c775f81b6