Merge tag 'v0.2.16' into main

commit 3af167d8db
6 changed files with 40 additions and 21 deletions

.github/workflows/build-and-release.yaml (vendored): 3 changes

@@ -33,6 +33,9 @@ jobs:
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
 
       - uses: actions/upload-artifact@v3
         with:

CHANGELOG.md: 13 changes

@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
+## [0.2.16]
+
+- Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
+- Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
+- Fix server doc arguments by @kjunggithub in #892
+- Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
+- Fix default max_tokens: chat completion is now unlimited (up to the context length) and completion defaults to 16 tokens, matching the OpenAI defaults, by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
+- Fix json_schema_to_gbnf helper so that it takes a JSON schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
+- Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
+- Update functionary chat handler for the new OpenAI API by @abetlen in 1b376c62b775b401653facf25a519d116aafe99a
+- Fix: add default stop sequence to the chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
+- Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
+
 ## [0.2.15]
 
 - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
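
From the changelog above, 0.2.16 adds a `set_seed` helper to the `Llama` class. A minimal usage sketch, assuming `set_seed` takes a single integer seed (mirroring llama.cpp's RNG seeding) and with an illustrative model path:

    from llama_cpp import Llama

    # Model path is illustrative; any GGUF model works here.
    llm = Llama(model_path="./models/model.gguf")
    llm.set_seed(1337)  # reseed so repeated runs sample the same tokens
    out = llm("Q: Name the planets in the solar system. A:", max_tokens=16)
    print(out["choices"][0]["text"])
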
llama_cpp/__init__.py: 2 changes

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.15"
+__version__ = "0.2.16"

llama_cpp/llama.py

@@ -1019,27 +1019,26 @@ class Llama:
         """
         assert self._ctx.ctx is not None
         assert self._batch.batch is not None
-        n_ctx = self._n_ctx
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.n_tokens)
+            n_past = self.n_tokens
             n_tokens = len(batch)
-            self._ctx.kv_cache_seq_rm(-1, n_past, -1)
             self._batch.set_batch(
                 batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
-            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
+            self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.context_params.logits_all else 1
+            rows = n_tokens
             cols = self._n_vocab
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
                 -1
-            )[:] = self._ctx.get_logits()[: rows * cols]
+            )[:] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens

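The key part of the sampling fix is the logits slice: rows is now always n_tokens and the copy starts at offset * cols, so when logits_all is False only the final token's row of logits is written into scores. A minimal sketch of that arithmetic (plain Python; the flat row-major logits layout is an assumption read off the slice math):

    def slice_logits(flat_logits, n_tokens, n_vocab, logits_all):
        # Mirrors the bookkeeping in Llama.eval above.
        rows = n_tokens
        cols = n_vocab
        # Keep every row when logits_all is True, otherwise only the last row.
        offset = 0 if logits_all else n_tokens - 1
        return flat_logits[offset * cols : rows * cols]

    # Toy batch: 3 tokens, vocabulary of 4, so 12 logits laid out row-major.
    flat = list(range(12))
    assert slice_logits(flat, 3, 4, logits_all=True) == flat
    assert slice_logits(flat, 3, 4, logits_all=False) == [8, 9, 10, 11]
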
tests/test_llama.py

@@ -1,4 +1,7 @@
+import ctypes
+
 import pytest
+
 import llama_cpp
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"

@@ -36,19 +39,20 @@ def test_llama_cpp_tokenization():
 
 
 def test_llama_patch(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 128
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
     assert n_vocab == 32000
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = " jumps over the lazy dog."

@@ -126,19 +130,19 @@ def test_llama_pickle():
 
 
 def test_utf8(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 512
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx, logits_all=True)
     n_vocab = llama.n_vocab()
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = "😀"

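The mocked llama_get_logits now returns a buffer sized for one row of n_vocab logits per evaluated token, since the updated eval may read rows * cols values per batch; n_vocab * n_ctx covers the largest possible batch. A small illustrative check (toy numbers; the real tests use n_ctx=128/512 and n_vocab=32000):

    import ctypes

    n_ctx, n_vocab = 8, 5  # toy sizes for illustration only

    # One float per (token, vocab-entry) pair; ctypes zero-initializes the buffer.
    logits = (ctypes.c_float * (n_vocab * n_ctx))()
    assert len(logits) == n_vocab * n_ctx
    assert logits[0] == 0.0
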
vendor/llama.cpp (vendored): 2 changes

@@ -1 +1 @@
-Subproject commit 875fb42871a0f5a88fbe31a0b5edd697b84038e4
+Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c