diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 1356d37..61027ef 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -33,6 +33,9 @@ jobs:
 
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
 
       - uses: actions/upload-artifact@v3
         with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index de3a03f..e6f0241 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.16]
+
+- Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
+- Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
+- Fix server doc arguments by @kjunggithub in #892
+- Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
+- Fix default max_tokens, chat completion is now unlimited (to context length) and completion is 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
+- Fix json_schema_to_gbnf helper so that it takes a json schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
+- Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
+- Update functionary chat handler for new OpenAI API by @abetlen in 1b376c62b775b401653facf25a519d116aafe99a
+- Fix add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
+- Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
+
 ## [0.2.15]
 
 - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 6e64afb..a24e550 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.15"
\ No newline at end of file
+__version__ = "0.2.16"
\ No newline at end of file
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 1e78221..2e18b47 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1019,27 +1019,26 @@ class Llama:
         """
         assert self._ctx.ctx is not None
         assert self._batch.batch is not None
-        n_ctx = self._n_ctx
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.n_tokens)
+            n_past = self.n_tokens
             n_tokens = len(batch)
-            self._ctx.kv_cache_seq_rm(-1, n_past, -1)
             self._batch.set_batch(
                 batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
-            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
+            self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.context_params.logits_all else 1
+            rows = n_tokens
             cols = self._n_vocab
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
                 -1
-            )[:] = self._ctx.get_logits()[: rows * cols]
+            )[:] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens
 
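(Editorial aside, not part of the patch: a minimal NumPy sketch of the slice arithmetic introduced above for the logits_all=False case. The flat buffer returned by get_logits() is treated as one row of n_vocab floats per batch token, and only the last token's row is copied into self.scores. The toy sizes and the numpy dependency are illustrative assumptions, not taken from this diff.)

    import numpy as np

    n_tokens, n_vocab = 4, 8                                  # toy sizes, purely illustrative
    logits = np.arange(n_tokens * n_vocab, dtype=np.float32)  # stand-in for the flat logits buffer

    logits_all = False
    rows = n_tokens
    cols = n_vocab
    offset = 0 if logits_all else n_tokens - 1                # keep only the last token's row

    new_slice = logits[offset * cols : rows * cols]           # what the patched code copies
    old_slice = logits[: 1 * cols]                            # what rows = 1 and [: rows * cols] selected before
    assert new_slice.size == cols                             # exactly one row: the final token's logits
    assert not np.array_equal(new_slice, old_slice)           # the two slices select different rows
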
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 5448743..23c7e86 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -1,4 +1,7 @@
+import ctypes
+
 import pytest
+
 import llama_cpp
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
@@ -36,19 +39,20 @@ def test_llama_cpp_tokenization():
 
 
 def test_llama_patch(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 128
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
+    assert n_vocab == 32000
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = " jumps over the lazy dog."
@@ -126,19 +130,19 @@ def test_llama_pickle():
 
 
 def test_utf8(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 512
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx, logits_all=True)
     n_vocab = llama.n_vocab()
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = "😀"
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 875fb42..a75fa57 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 875fb42871a0f5a88fbe31a0b5edd697b84038e4
+Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c
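
(Editorial aside: a hedged usage sketch of the API-level items in the 0.2.16 changelog entries above. The model path and prompt are placeholders and the output is model-dependent; this is an illustration, not code from this diff.)

    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf")  # placeholder path, not from this repo
    llm.set_seed(1337)                               # new in 0.2.16: reseed sampling for reproducibility

    # Plain completion still defaults to max_tokens=16 (matching OpenAI),
    # while chat completion now generates up to the remaining context length.
    out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
    print(out["choices"][0]["text"])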