Merge tag 'v0.2.16' into main

Andrei Betlen 2023-11-14 15:32:08 -05:00
commit 3af167d8db
6 changed files with 40 additions and 21 deletions


@@ -33,6 +33,9 @@ jobs:
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
       - uses: actions/upload-artifact@v3
         with:


@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
+## [0.2.16]
+
+- Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
+- Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
+- Fix server doc arguments by @kjunggithub in #892
+- Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
+- Fix default max_tokens: chat completion is now unlimited (up to the context length) and completion defaults to 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
+- Fix json_schema_to_gbnf helper so that it takes a JSON schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
+- Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
+- Update functionary chat handler for new OpenAI API by @abetlen in 1b376c62b775b401653facf25a519d116aafe99a
+- Fix: add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
+- Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
+
 ## [0.2.15]
 
 - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4


@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.15"
+__version__ = "0.2.16"


@@ -1019,27 +1019,26 @@ class Llama:
         """
         assert self._ctx.ctx is not None
         assert self._batch.batch is not None
-        n_ctx = self._n_ctx
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.n_tokens)
+            n_past = self.n_tokens
             n_tokens = len(batch)
-            self._ctx.kv_cache_seq_rm(-1, n_past, -1)
             self._batch.set_batch(
                 batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
-            self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
+            self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            rows = n_tokens if self.context_params.logits_all else 1
+            rows = n_tokens
             cols = self._n_vocab
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape(
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
                 -1
-            )[:] = self._ctx.get_logits()[: rows * cols]
+            )[:] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens
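The hunk above simplifies how `eval` tracks positions: the KV cache is truncated once to `self.n_tokens` before batching, `n_past` is simply the running token count, and logits are written into `self.scores` at `n_past`-based offsets, with the source slice now starting at `offset * cols`. Below is a standalone sketch of that logits bookkeeping, using NumPy arrays in place of the real bindings; the `flat_logits` argument is an assumption standing in for `self._ctx.get_logits()`.

import numpy as np

def save_logits(scores, flat_logits, n_past, n_tokens, n_vocab, logits_all):
    """Mirror of the slicing in the new eval() path above."""
    rows = n_tokens
    cols = n_vocab
    # Only the last token's logits are kept when logits_all is False.
    offset = 0 if logits_all else n_tokens - 1
    scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[:] = (
        flat_logits[offset * cols : rows * cols]
    )

# Example: a 4-token batch decoded at position 10, keeping only the last row.
scores = np.zeros((16, 8), dtype=np.single)
flat_logits = np.arange(4 * 8, dtype=np.single)  # stand-in for get_logits()
save_logits(scores, flat_logits, n_past=10, n_tokens=4, n_vocab=8, logits_all=False)
assert (scores[13] == flat_logits[24:32]).all()  # row n_past + n_tokens - 1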


@@ -1,4 +1,7 @@
+import ctypes
 import pytest
 import llama_cpp
 
 MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
@@ -36,19 +39,20 @@ def test_llama_cpp_tokenization():
 def test_llama_patch(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 128
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx)
     n_vocab = llama_cpp.llama_n_vocab(llama._model.model)
+    assert n_vocab == 32000
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = " jumps over the lazy dog."
@@ -126,19 +130,19 @@ def test_llama_pickle():
 def test_utf8(monkeypatch):
-    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_ctx = 512
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx, logits_all=True)
     n_vocab = llama.n_vocab()
 
     ## Set up mock function
-    def mock_eval(*args, **kwargs):
+    def mock_decode(*args, **kwargs):
         return 0
 
     def mock_get_logits(*args, **kwargs):
-        return (llama_cpp.c_float * n_vocab)(
-            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
-        )
+        size = n_vocab * n_ctx
+        return (llama_cpp.c_float * size)()
 
-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode)
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
 
     output_text = "😀"

vendor/llama.cpp vendored

@@ -1 +1 @@
-Subproject commit 875fb42871a0f5a88fbe31a0b5edd697b84038e4
+Subproject commit a75fa576abba9d37f463580c379e4bbf1e1ad03c