From 3bb45f16589cfe3649330f78114237f64c8f5080 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 16:38:45 +0200 Subject: [PATCH 001/443] More reasonable defaults --- examples/low_level_api/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index f16980c..58a5688 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -50,7 +50,7 @@ class GptParams: # If chat ended prematurely, append this to the conversation to fix it. # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" - fix_prefix: str = " " + fix_prefix: str = "" output_postfix: str = "" input_echo: bool = True, From ffb1e8025104eea18d5043e7b5cb2d39b2e04340 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 11:37:41 -0400 Subject: [PATCH 002/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 89c8271..2b0b35b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.30" +version = "0.1.31" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 33c2e26..a7c6262 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.30", + version="0.1.31", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 5247e32d9e9ef9d33950da07865c535ae6df988b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 12:56:23 -0400 Subject: [PATCH 003/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 180b693..684da25 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6 +Subproject commit 684da25926e5c505f725b4f10b5485b218fa1fc7 From 3727ba4d9e932dc90eead9b8210fb5498670cbbc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 12:56:48 -0400 Subject: [PATCH 004/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2b0b35b..50fe7e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.31" +version = "0.1.32" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index a7c6262..3ce6001 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.31", + version="0.1.32", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 213cc5c34082490f7aa88d27aa24d5eae2a39ab9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 11 Apr 2023 11:54:31 -0400 Subject: [PATCH 005/443] Remove async from function signature to avoid blocking the server --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 44ee1f0..80cbe01 
100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -196,7 +196,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet "/v1/chat/completions", response_model=CreateChatCompletionResponse, ) -async def create_chat_completion( +def create_chat_completion( request: CreateChatCompletionRequest, ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: completion_or_chunks = llama.create_chat_completion( From 9f1e56559434e51268564532e870298fb0e27d80 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 11 Apr 2023 11:59:03 -0400 Subject: [PATCH 006/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 47 +++++++++++++++++++++++++++++++++++------- vendor/llama.cpp | 2 +- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8a5869c..0f2b4d5 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1,9 +1,21 @@ import sys import os import ctypes -from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + Structure, + Array, + c_uint8, + c_size_t, +) import pathlib + # Load the library def _load_shared_library(lib_base_name): # Determine the file extension based on the platform @@ -22,10 +34,10 @@ def _load_shared_library(lib_base_name): # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}" + _base_path / f"{lib_base_name}{lib_ext}", ] - if ("LLAMA_CPP_LIB" in os.environ): + if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib = pathlib.Path(lib_base_name) _base_path = _lib.parent.resolve() @@ -43,7 +55,10 @@ def _load_shared_library(lib_base_name): except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + # Specify the base name of the shared library to load _lib_base_name = "llama" @@ -95,6 +110,10 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) +LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors # Functions @@ -106,18 +125,23 @@ def llama_context_default_params() -> llama_context_params: _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params + def llama_mmap_supported() -> c_bool: return _lib.llama_mmap_supported() + _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool + def llama_mlock_supported() -> c_bool: return _lib.llama_mlock_supported() + _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool + # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. 
# Return NULL on failure @@ -142,42 +166,49 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success -def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int -) -> c_int: +def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int: return _lib.llama_model_quantize(fname_inp, fname_out, itype) _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): return _lib.llama_get_kv_cache(ctx) + _lib.llama_get_kv_cache.argtypes = [llama_context_p] _lib.llama_get_kv_cache.restype = POINTER(c_uint8) + # Returns the size of the KV cache def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_kv_cache_size(ctx) + _lib.llama_get_kv_cache_size.argtypes = [llama_context_p] _lib.llama_get_kv_cache_size.restype = c_size_t + # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) + _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the KV cache containing the current context for the model -def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): +def llama_set_kv_cache( + ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int +): return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) + _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] _lib.llama_set_kv_cache.restype = None diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 684da25..3e6e70d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 684da25926e5c505f725b4f10b5485b218fa1fc7 +Subproject commit 3e6e70d8e8917b5bd14c7c9f9b89a585f1ff0b31 From 2a60eb820f40ada7d0ac83fd3c0f23ef15052a07 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 11 Apr 2023 23:53:46 -0400 Subject: [PATCH 007/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3e6e70d..8b67998 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3e6e70d8e8917b5bd14c7c9f9b89a585f1ff0b31 +Subproject commit 8b679987cdce292ff36bd741f6715e4927e26f9b From c14201dc0f7c544232e5f894726f8ca4d92424f7 Mon Sep 17 00:00:00 2001 From: Niek van der Maas Date: Wed, 12 Apr 2023 11:53:39 +0200 Subject: [PATCH 008/443] Add Dockerfile + build workflow --- .github/workflows/publish.yaml | 31 ++++++++++++++++++++++++++++++- Dockerfile | 10 ++++++++++ README.md | 8 ++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 Dockerfile diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 92b6e5b..16a6012 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -28,4 +28,33 @@ jobs: # if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} + + docker: + name: Build and push Docker image + runs-on: ubuntu-latest + needs: build-n-publish + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + 
- name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b500a0b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3-buster + +# We need to set the host to 0.0.0.0 to allow outside access +ENV HOST 0.0.0.0 + +# Install the package +RUN pip install llama-cpp-python[server] + +# Run the server +CMD python3 -m llama_cpp.server \ No newline at end of file diff --git a/README.md b/README.md index 2c8c0a5..81ad723 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,14 @@ python3 -m llama_cpp.server Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. +## Docker image + +A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: + +```bash +docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-vicuna-7b-4bit.bin ghcr.io/abetlen/llama-cpp-python:latest +``` + ## Low-level API The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. From 9ce8146231d77e9aceb8a0f2c0f2721755640eed Mon Sep 17 00:00:00 2001 From: Niek van der Maas Date: Wed, 12 Apr 2023 11:56:16 +0200 Subject: [PATCH 009/443] More generic model name --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 81ad723..bcb25e3 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-vicuna-7b-4bit.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API From b3805bb9ccc2a33d68b568cd00e10f89a0f9506b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:05:11 -0400 Subject: [PATCH 010/443] Implement logprobs parameter for text completion. Closes #2 --- llama_cpp/llama.py | 125 ++++++++++++++++++++++++++++++----- llama_cpp/server/__main__.py | 2 + 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2d76ec4..3e13776 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,6 +2,7 @@ import os import sys import uuid import time +import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator from collections import deque @@ -76,6 +77,9 @@ class Llama: ) self.tokens_consumed = 0 self.n_batch = min(n_ctx, n_batch) + self.n_tokens = 0 + self.n_past = 0 + self.all_logits: List[List[float]] = [] # TODO: Use an array instead of a list. 
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -136,6 +140,9 @@ class Llama: [llama_cpp.llama_token(0)] * self.last_n_tokens_size ) self.tokens_consumed = 0 + self.n_tokens = 0 + self.n_past = 0 + self.all_logits = [] def eval(self, tokens: Sequence[llama_cpp.llama_token]): """Evaluate a list of tokens. @@ -147,18 +154,31 @@ class Llama: n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), self.tokens_consumed) + self.n_past = min(n_ctx - len(batch), self.tokens_consumed) + self.n_tokens = len(batch) return_code = llama_cpp.llama_eval( ctx=self.ctx, tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(len(batch)), - n_past=llama_cpp.c_int(n_past), + n_tokens=llama_cpp.c_int(self.n_tokens), + n_past=llama_cpp.c_int(self.n_past), n_threads=llama_cpp.c_int(self.n_threads), ) if int(return_code) != 0: raise RuntimeError(f"llama_eval returned {return_code}") self.last_n_tokens_data.extend(batch) self.tokens_consumed += len(batch) + if self.params.logits_all: + self.all_logits.extend(self._logits()) + + def _logits(self) -> List[List[float]]: + """Return the logits from the last call to llama_eval.""" + assert self.ctx is not None + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + cols = int(n_vocab) + rows = self.n_tokens if self.params.logits_all else 1 + logits_view = llama_cpp.llama_get_logits(self.ctx) + logits = [[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)] + return logits def sample( self, @@ -327,14 +347,55 @@ class Llama: else: stop_sequences = [] - finish_reason = None - for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - repeat_penalty=repeat_penalty, - ): + text_offset = 0 + text_offsets: List[int] = [] + token_logprobs: List[float] = [] + tokens: List[str] = [] + top_logprobs: List[Dict[str, float]] = [] + + self.reset() + self.eval(prompt_tokens) + + if logprobs is not None and self.params.logits_all is False: + raise ValueError( + "logprobs is not supported for models created with logits_all=False" + ) + + if logprobs is not None: + token_strs = [ + self.detokenize([token]).decode("utf-8") for token in prompt_tokens + ] + logprobs_all = [ + [Llama.logit_to_logprob(logit) for logit in row] + for row in self.all_logits + ] + for token, token_str, logprobs_token in zip( + prompt_tokens, token_strs, logprobs_all + ): + text_offsets.append(text_offset) + text_offset += len(token_str) + tokens.append(token_str) + sorted_logprobs = list( + sorted( + zip(logprobs_token, range(len(logprobs_token))), reverse=True + ) + ) + token_logprobs.append(sorted_logprobs[int(token)][0]) + top_logprob = { + self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) + top_logprobs.append(top_logprob) + + finish_reason = "length" + while True: + token = self.sample( + top_k=top_k, + top_p=top_p, + temp=temperature, + repeat_penalty=repeat_penalty, + ) if token == llama_cpp.llama_token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -377,13 +438,35 @@ class Llama: } ], } + + if logprobs is not None: + # TODO: Confirm wether this should happen before or after + # next eval. 
+ token_str = self.detokenize([token]).decode("utf-8") + text_offsets.append(text_offset) + text_offset += len(token_str) + tokens.append(token_str) + logprobs_token = [ + Llama.logit_to_logprob(logit) for logit in self.all_logits[-1] + ] + sorted_logprobs = list( + sorted( + zip(logprobs_token, range(len(logprobs_token))), reverse=True + ) + ) + token_logprobs.append(sorted_logprobs[int(token)][0]) + top_logprob = { + self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: logprobs_token[int(token)]}) + top_logprobs.append(top_logprob) + if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) finish_reason = "length" break - - if finish_reason is None: - finish_reason = "length" + self.eval([token]) if stream: yield { @@ -410,8 +493,14 @@ class Llama: if suffix is not None: text = text + suffix + logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: - raise NotImplementedError("logprobs not implemented") + logprobs_or_none = { + "tokens": tokens, + "text_offset": text_offsets, + "token_logprobs": token_logprobs, + "top_logprobs": top_logprobs, + } if self.verbose: llama_cpp.llama_print_timings(self.ctx) @@ -425,7 +514,7 @@ class Llama: { "text": text, "index": 0, - "logprobs": None, + "logprobs": logprobs_or_none, "finish_reason": finish_reason, } ], @@ -704,3 +793,7 @@ class Llama: def token_bos() -> llama_cpp.llama_token: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + + @staticmethod + def logit_to_logprob(x: float) -> float: + return math.log(1.0 + math.exp(x)) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 80cbe01..49a00b2 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -33,6 +33,7 @@ class Settings(BaseSettings): use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... embedding: bool = True last_n_tokens_size: int = 64 + logits_all: bool = False app = FastAPI( @@ -52,6 +53,7 @@ llama = llama_cpp.Llama( f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, embedding=settings.embedding, + logits_all=settings.logits_all, n_threads=settings.n_threads, n_batch=settings.n_batch, n_ctx=settings.n_ctx, From 6cf58765388b85329769cb78405c5e5ff74dc414 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:06:04 -0400 Subject: [PATCH 011/443] Deprecate generate method --- llama_cpp/llama.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3e13776..69f7680 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -3,6 +3,7 @@ import sys import uuid import time import math +import warnings import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator from collections import deque @@ -239,6 +240,11 @@ class Llama: Yields: The generated tokens. 
""" + warnings.warn( + "Llama.generate is deprecated and will be removed in v0.2.0", + DeprecationWarning, + stacklevel=2, + ) assert self.ctx is not None self.reset() while True: From 2f9b6490059f57e15b13de71db3cc19381ed33ef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:06:22 -0400 Subject: [PATCH 012/443] Style fix --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 69f7680..45e09d1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -330,7 +330,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, - ) -> Union[Iterator[Completion], Iterator[CompletionChunk],]: + ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id = f"cmpl-{str(uuid.uuid4())}" created = int(time.time()) From c854c2564b8cd97c87702480c106a86ca1828d31 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:07:14 -0400 Subject: [PATCH 013/443] Don't serialize stateful parameters --- llama_cpp/llama.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 45e09d1..c545420 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -763,8 +763,6 @@ class Llama: use_mlock=self.params.use_mlock, embedding=self.params.embedding, last_n_tokens_size=self.last_n_tokens_size, - last_n_tokens_data=self.last_n_tokens_data, - tokens_consumed=self.tokens_consumed, n_batch=self.n_batch, n_threads=self.n_threads, ) @@ -786,9 +784,6 @@ class Llama: last_n_tokens_size=state["last_n_tokens_size"], verbose=state["verbose"], ) - self.last_n_tokens_data = state["last_n_tokens_data"] - self.tokens_consumed = state["tokens_consumed"] - @staticmethod def token_eos() -> llama_cpp.llama_token: From 005c78d26c00ae5d7e10166993909a0e2ff4af8d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:29:00 -0400 Subject: [PATCH 014/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 1 + vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0f2b4d5..811f69a 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -114,6 +114,7 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 # Functions diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8b67998..e7f6997 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8b679987cdce292ff36bd741f6715e4927e26f9b +Subproject commit e7f6997f897a18b6372a6460e25c5f89e1469f1d From 19598ac4e88619b67514b07b2df93e1b9a039df1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 19:07:53 -0400 Subject: [PATCH 015/443] Fix threading bug. Closes #62 --- llama_cpp/server/__main__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 49a00b2..4360506 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -13,12 +13,13 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
""" import os import json +from threading import Lock from typing import List, Optional, Literal, Union, Iterator, Dict from typing_extensions import TypedDict import llama_cpp -from fastapi import FastAPI +from fastapi import Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict from sse_starlette.sse import EventSourceResponse @@ -59,6 +60,13 @@ llama = llama_cpp.Llama( n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, ) +llama_lock = Lock() + + +def get_llama(): + with llama_lock: + yield llama + class CreateCompletionRequest(BaseModel): @@ -101,7 +109,7 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) "/v1/completions", response_model=CreateCompletionResponse, ) -def create_completion(request: CreateCompletionRequest): +def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=Depends(get_llama)): if isinstance(request.prompt, list): request.prompt = "".join(request.prompt) @@ -146,7 +154,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) "/v1/embeddings", response_model=CreateEmbeddingResponse, ) -def create_embedding(request: CreateEmbeddingRequest): +def create_embedding(request: CreateEmbeddingRequest, llama: llama_cpp.Llama=Depends(get_llama)): return llama.create_embedding(**request.dict(exclude={"model", "user"})) @@ -200,6 +208,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet ) def create_chat_completion( request: CreateChatCompletionRequest, + llama: llama_cpp.Llama=Depends(get_llama), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: completion_or_chunks = llama.create_chat_completion( **request.dict( From 0daf16defcc353de715e29e6103e4c7a2422ee58 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 19:08:11 -0400 Subject: [PATCH 016/443] Enable logprobs on completion endpoint --- llama_cpp/server/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4360506..8b9614e 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -118,7 +118,6 @@ def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=D exclude={ "model", "n", - "logprobs", "frequency_penalty", "presence_penalty", "best_of", From 4f5f99ef2ae4aa6a8e8d636e67eb8aca7fc81184 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 22:40:12 -0400 Subject: [PATCH 017/443] Formatting --- llama_cpp/server/__main__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 8b9614e..c54d91b 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -109,7 +109,9 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) "/v1/completions", response_model=CreateCompletionResponse, ) -def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=Depends(get_llama)): +def create_completion( + request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) +): if isinstance(request.prompt, list): request.prompt = "".join(request.prompt) @@ -153,7 +155,9 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) "/v1/embeddings", response_model=CreateEmbeddingResponse, ) -def create_embedding(request: CreateEmbeddingRequest, llama: llama_cpp.Llama=Depends(get_llama)): +def 
create_embedding( + request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) +): return llama.create_embedding(**request.dict(exclude={"model", "user"})) @@ -207,7 +211,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet ) def create_chat_completion( request: CreateChatCompletionRequest, - llama: llama_cpp.Llama=Depends(get_llama), + llama: llama_cpp.Llama = Depends(get_llama), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: completion_or_chunks = llama.create_chat_completion( **request.dict( From 22fa5a621fa2f8249943e0a52dd8c8a21e9baca0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Apr 2023 00:19:55 -0400 Subject: [PATCH 018/443] Revert "Deprecate generate method" This reverts commit 6cf58765388b85329769cb78405c5e5ff74dc414. --- llama_cpp/llama.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c545420..67fefe5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -3,7 +3,6 @@ import sys import uuid import time import math -import warnings import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator from collections import deque @@ -240,11 +239,6 @@ class Llama: Yields: The generated tokens. """ - warnings.warn( - "Llama.generate is deprecated and will be removed in v0.2.0", - DeprecationWarning, - stacklevel=2, - ) assert self.ctx is not None self.reset() while True: From 6595ad84bfd5360ac22e311f91eef3d78bdb65f2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Apr 2023 00:28:00 -0400 Subject: [PATCH 019/443] Add field to disable reseting between generations --- llama_cpp/llama.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 67fefe5..db9a337 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -218,6 +218,7 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + reset: bool = True, ) -> Generator[ llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None ]: @@ -235,12 +236,14 @@ class Llama: top_p: The top-p sampling parameter. temp: The temperature parameter. repeat_penalty: The repeat penalty parameter. + reset: Whether to reset the model state. Yields: The generated tokens. 
""" assert self.ctx is not None - self.reset() + if reset: + self.reset() while True: self.eval(tokens) token = self.sample( From 7dc0838fff5954957f4f0b585831ff8c6732d370 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Apr 2023 00:35:05 -0400 Subject: [PATCH 020/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 50fe7e7..a0b6df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.32" +version = "0.1.33" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 3ce6001..1648f64 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.32", + version="0.1.33", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 26cc4ee029704976db08a5c67ab812200fcf2c9e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 09:59:08 -0400 Subject: [PATCH 021/443] Fix signature for stop parameter --- llama_cpp/llama.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index db9a337..ae25137 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -323,7 +323,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[List[str]] = [], repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -336,6 +336,7 @@ class Llama: prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8")) text = b"" returned_characters = 0 + stop = stop if not None else [] if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -537,7 +538,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[List[str]] = [], repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -592,7 +593,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[List[str]] = [], repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -698,7 +699,7 @@ class Llama: top_p: float = 0.95, top_k: int = 40, stream: bool = False, - stop: List[str] = [], + stop: Optional[List[str]] = [], max_tokens: int = 128, repeat_penalty: float = 1.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: @@ -717,6 +718,7 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. """ + stop = stop if not None else [] instructions = """Complete the following chat conversation between the user and the assistant. 
System messages should be strictly followed as additional instructions.""" chat_history = "\n".join( f'{message["role"]} {message.get("user", "")}: {message["content"]}' From 6153baab2d2ac7a2c6ce9caa60474d84cf78dca6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 09:59:33 -0400 Subject: [PATCH 022/443] Clean up logprobs implementation --- llama_cpp/llama.py | 106 +++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 67 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ae25137..ecfd2f4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -351,55 +351,19 @@ class Llama: else: stop_sequences = [] - text_offset = 0 - text_offsets: List[int] = [] - token_logprobs: List[float] = [] - tokens: List[str] = [] - top_logprobs: List[Dict[str, float]] = [] - - self.reset() - self.eval(prompt_tokens) - if logprobs is not None and self.params.logits_all is False: raise ValueError( "logprobs is not supported for models created with logits_all=False" ) - if logprobs is not None: - token_strs = [ - self.detokenize([token]).decode("utf-8") for token in prompt_tokens - ] - logprobs_all = [ - [Llama.logit_to_logprob(logit) for logit in row] - for row in self.all_logits - ] - for token, token_str, logprobs_token in zip( - prompt_tokens, token_strs, logprobs_all - ): - text_offsets.append(text_offset) - text_offset += len(token_str) - tokens.append(token_str) - sorted_logprobs = list( - sorted( - zip(logprobs_token, range(len(logprobs_token))), reverse=True - ) - ) - token_logprobs.append(sorted_logprobs[int(token)][0]) - top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob - for logprob, i in sorted_logprobs[:logprobs] - } - top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) - top_logprobs.append(top_logprob) - finish_reason = "length" - while True: - token = self.sample( - top_k=top_k, - top_p=top_p, - temp=temperature, - repeat_penalty=repeat_penalty, - ) + for token in self.generate( + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + repeat_penalty=repeat_penalty, + ): if token == llama_cpp.llama_token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -443,34 +407,10 @@ class Llama: ], } - if logprobs is not None: - # TODO: Confirm wether this should happen before or after - # next eval. 
- token_str = self.detokenize([token]).decode("utf-8") - text_offsets.append(text_offset) - text_offset += len(token_str) - tokens.append(token_str) - logprobs_token = [ - Llama.logit_to_logprob(logit) for logit in self.all_logits[-1] - ] - sorted_logprobs = list( - sorted( - zip(logprobs_token, range(len(logprobs_token))), reverse=True - ) - ) - token_logprobs.append(sorted_logprobs[int(token)][0]) - top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob - for logprob, i in sorted_logprobs[:logprobs] - } - top_logprob.update({token_str: logprobs_token[int(token)]}) - top_logprobs.append(top_logprob) - if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) finish_reason = "length" break - self.eval([token]) if stream: yield { @@ -499,6 +439,38 @@ class Llama: logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: + text_offset = 0 + text_offsets: List[int] = [] + token_logprobs: List[float] = [] + tokens: List[str] = [] + top_logprobs: List[Dict[str, float]] = [] + + all_tokens = prompt_tokens + completion_tokens + all_token_strs = [ + self.detokenize([token]).decode("utf-8") for token in all_tokens + ] + all_logprobs = [ + [Llama.logit_to_logprob(logit) for logit in row] + for row in self.all_logits + ] + for token, token_str, logprobs_token in zip( + all_tokens, all_token_strs, all_logprobs + ): + text_offsets.append(text_offset) + text_offset += len(token_str) + tokens.append(token_str) + sorted_logprobs = list( + sorted( + zip(logprobs_token, range(len(logprobs_token))), reverse=True + ) + ) + token_logprobs.append(sorted_logprobs[int(token)][0]) + top_logprob = { + self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) + top_logprobs.append(top_logprob) logprobs_or_none = { "tokens": tokens, "text_offset": text_offsets, From 6c7cec0c65373d2892dbb23581af27ab407669d9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 10:01:15 -0400 Subject: [PATCH 023/443] Fix completion request --- llama_cpp/server/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index c54d91b..7fc3c57 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -76,7 +76,7 @@ class CreateCompletionRequest(BaseModel): temperature: float = 0.8 top_p: float = 0.95 echo: bool = False - stop: List[str] = [] + stop: Optional[List[str]] = [] stream: bool = False # ignored or currently unsupported @@ -173,7 +173,7 @@ class CreateChatCompletionRequest(BaseModel): temperature: float = 0.8 top_p: float = 0.95 stream: bool = False - stop: List[str] = [] + stop: Optional[List[str]] = [] max_tokens: int = 128 # ignored or currently unsupported From 9c8c2c37dce2326e3272beabd0c6460a4a4a9a3f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 10:01:57 -0400 Subject: [PATCH 024/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e7f6997..a32f7ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e7f6997f897a18b6372a6460e25c5f89e1469f1d +Subproject commit a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f From 6e298d8fca1ee5f25239e54aa5f3eed2eee4651e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 22:21:19 -0400 Subject: [PATCH 025/443] Set kv 
cache size to f16 by default --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ecfd2f4..cd737c5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -21,7 +21,7 @@ class Llama: n_ctx: int = 512, n_parts: int = -1, seed: int = 1337, - f16_kv: bool = False, + f16_kv: bool = True, logits_all: bool = False, vocab_only: bool = False, use_mmap: bool = True, From 25b646c2fb1e510bf9133f1ee379cf778e99df6f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 23:32:05 -0400 Subject: [PATCH 026/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a32f7ac..c85e03d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f +Subproject commit c85e03d12e4b8af22cb13aa9c618dcd5935862fd From ac7068a4699f3a45c555072a9698d9de497aa88c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 23:33:00 -0400 Subject: [PATCH 027/443] Track generated tokens internally --- llama_cpp/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cd737c5..3ff94a6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -76,6 +76,7 @@ class Llama: maxlen=self.last_n_tokens_size, ) self.tokens_consumed = 0 + self.tokens: List[llama_cpp.llama_token] = [] self.n_batch = min(n_ctx, n_batch) self.n_tokens = 0 self.n_past = 0 @@ -140,6 +141,7 @@ class Llama: [llama_cpp.llama_token(0)] * self.last_n_tokens_size ) self.tokens_consumed = 0 + self.tokens.clear() self.n_tokens = 0 self.n_past = 0 self.all_logits = [] @@ -165,6 +167,7 @@ class Llama: ) if int(return_code) != 0: raise RuntimeError(f"llama_eval returned {return_code}") + self.tokens.extend(batch) self.last_n_tokens_data.extend(batch) self.tokens_consumed += len(batch) if self.params.logits_all: From e90e122f2a6970edb7d10d1b95e5e97932ef8e18 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Apr 2023 23:33:18 -0400 Subject: [PATCH 028/443] Use clear --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3ff94a6..93c6288 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -144,7 +144,7 @@ class Llama: self.tokens.clear() self.n_tokens = 0 self.n_past = 0 - self.all_logits = [] + self.all_logits.clear() def eval(self, tokens: Sequence[llama_cpp.llama_token]): """Evaluate a list of tokens. 
From d7de0e8014d9b18cd7e1ede07c3ea786a532767e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 00:08:04 -0400 Subject: [PATCH 029/443] Bugfix --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 93c6288..0754a8d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -339,7 +339,7 @@ class Llama: prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8")) text = b"" returned_characters = 0 - stop = stop if not None else [] + stop = stop if stop is not None else [] if self.verbose: llama_cpp.llama_reset_timings(self.ctx) From 3cd67c7bd730a721dcea915042ad8568afe76111 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 11:39:21 -0400 Subject: [PATCH 030/443] Add type annotations --- llama_cpp/llama.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0754a8d..54a2f4a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -332,13 +332,15 @@ class Llama: stream: bool = False, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None - completion_id = f"cmpl-{str(uuid.uuid4())}" - created = int(time.time()) + completion_id: str = f"cmpl-{str(uuid.uuid4())}" + created: int = int(time.time()) completion_tokens: List[llama_cpp.llama_token] = [] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8")) - text = b"" - returned_characters = 0 + prompt_tokens: List[llama_cpp.llama_token] = self.tokenize( + b" " + prompt.encode("utf-8") + ) + text: bytes = b"" + returned_characters: int = 0 stop = stop if stop is not None else [] if self.verbose: From 02f9fb82fbfe8b33dba3f39a81625837dca34e02 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 11:39:52 -0400 Subject: [PATCH 031/443] Bugfix --- llama_cpp/llama.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 54a2f4a..e570236 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -695,10 +695,7 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. """ - stop = stop if not None else [] - instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions.""" - chat_history = "\n".join( - f'{message["role"]} {message.get("user", "")}: {message["content"]}' + stop = stop if stop is not None else [] for message in messages ) PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: " From 62087514c641d2ee93b1797df3388af6d60f8c6d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 11:58:19 -0400 Subject: [PATCH 032/443] Update chat prompt --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e570236..578dcb6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -696,10 +696,12 @@ class Llama: Generated chat completion or a stream of chat completion chunks. 
""" stop = stop if stop is not None else [] + chat_history = "".join( + f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' for message in messages ) - PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: " - PROMPT_STOP = ["###", "\nuser: ", "\nassistant: ", "\nsystem: "] + PROMPT = chat_history + "### Assistant:" + PROMPT_STOP = ["### Assistant:", "### Human:", "\n"] completion_or_chunks = self( prompt=PROMPT, stop=PROMPT_STOP + stop, From 83b2be6dc4e88154a72f221420823702bae6a1bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 11:58:43 -0400 Subject: [PATCH 033/443] Update chat parameters --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 578dcb6..63c7b53 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -672,12 +672,12 @@ class Llama: def create_chat_completion( self, messages: List[ChatCompletionMessage], - temperature: float = 0.8, + temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, stream: bool = False, stop: Optional[List[str]] = [], - max_tokens: int = 128, + max_tokens: int = 256, repeat_penalty: float = 1.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. From a6372a7ae5c32cdae7cded800dd988cd12b828fd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 12:02:48 -0400 Subject: [PATCH 034/443] Update stop sequences for chat --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 63c7b53..121f91d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -701,7 +701,7 @@ class Llama: for message in messages ) PROMPT = chat_history + "### Assistant:" - PROMPT_STOP = ["### Assistant:", "### Human:", "\n"] + PROMPT_STOP = ["### Assistant:", "### Human:"] completion_or_chunks = self( prompt=PROMPT, stop=PROMPT_STOP + stop, From 92c077136d1f0b029f8907a79eae009a750005e2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 12:03:09 -0400 Subject: [PATCH 035/443] Add experimental cache --- llama_cpp/llama.py | 69 +++++++++++++++++++++++++++++++++--- llama_cpp/server/__main__.py | 5 ++- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 121f91d..b92801c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -11,6 +11,15 @@ from . import llama_cpp from .llama_types import * +class LlamaCache: + """Cache for a llama.cpp model. + + NOTE: This implementation currently only tells the Llama class to avoid reprocessing bytes and continue from the last + completion. It does not actually cache the results.""" + + pass + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -82,6 +91,14 @@ class Llama: self.n_past = 0 self.all_logits: List[List[float]] = [] # TODO: Use an array instead of a list. + ### HACK: This is a hack to work around the fact that the llama.cpp API does not yet support + ### saving and restoring state, this allows us to continue a completion if the last + ### completion_bytes is a prefix to the prompt passed in. However this is actually incorrect + ### because it does not take into account stop tokens which have been processed by the model. 
+ self._completion_bytes: List[bytes] = [] + self._cache: Optional[LlamaCache] = None + ### + self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) if not os.path.exists(model_path): @@ -135,6 +152,14 @@ class Llama: output += llama_cpp.llama_token_to_str(self.ctx, token) return output + def set_cache(self, cache: Optional[LlamaCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self._cache = cache + def reset(self): """Reset the model state.""" self.last_n_tokens_data.extend( @@ -245,6 +270,17 @@ class Llama: The generated tokens. """ assert self.ctx is not None + ### HACK + if ( + reset + and self._cache + and len(self.tokens) > 0 + and self.tokens == tokens[: len(self.tokens)] + ): + if self.verbose: + print("generate cache hit", file=sys.stderr) + reset = False + ### if reset: self.reset() while True: @@ -361,6 +397,21 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) + ### HACK + reset: bool = True + _prompt: bytes = prompt.encode("utf-8") + _completion: bytes = b"".join(self._completion_bytes) + if len(_completion) and self._cache and _prompt.startswith(_completion): + if self.verbose: + print("completion cache hit", file=sys.stderr) + reset = False + _prompt = _prompt[len(_completion) :] + prompt_tokens = self.tokenize(b" " + _prompt) + self._completion_bytes.append(_prompt) + else: + self._completion_bytes = [prompt.encode("utf-8")] + ### + finish_reason = "length" for token in self.generate( prompt_tokens, @@ -368,6 +419,7 @@ class Llama: top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty, + reset=reset, ): if token == llama_cpp.llama_token_eos(): text = self.detokenize(completion_tokens) @@ -397,6 +449,9 @@ class Llama: break text = all_text[: len(all_text) - longest] returned_characters += len(text[start:]) + ### HACK + self._completion_bytes.append(text[start:]) + ### yield { "id": completion_id, "object": "text_completion", @@ -418,6 +473,9 @@ class Llama: break if stream: + ### HACK + self._completion_bytes.append(text[returned_characters:]) + ### yield { "id": completion_id, "object": "text_completion", @@ -434,13 +492,16 @@ class Llama: } return - text = text.decode("utf-8") + ### HACK + self._completion_bytes.append(text) + ### + text_str = text.decode("utf-8") if echo: - text = prompt + text + text_str = prompt + text_str if suffix is not None: - text = text + suffix + text_str = text_str + suffix logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: @@ -493,7 +554,7 @@ class Llama: "model": self.model_path, "choices": [ { - "text": text, + "text": text_str, "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 7fc3c57..48481c6 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -35,6 +35,7 @@ class Settings(BaseSettings): embedding: bool = True last_n_tokens_size: int = 64 logits_all: bool = False + cache: bool = False # WARNING: This is an experimental feature app = FastAPI( @@ -60,6 +61,9 @@ llama = llama_cpp.Llama( n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, ) +if settings.cache: + cache = llama_cpp.LlamaCache() + llama.set_cache(cache) llama_lock = Lock() @@ -68,7 +72,6 @@ def get_llama(): yield llama - class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) From 887f3b73ac16976d63c699adcb399ad63054ee74 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: 
Sat, 15 Apr 2023 12:16:05 -0400 Subject: [PATCH 036/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c85e03d..e95b655 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c85e03d12e4b8af22cb13aa9c618dcd5935862fd +Subproject commit e95b6554b493e71a0275764342e09bd5784a7026 From 59b37bbbd2fb2a69788c5de6bd103439befbc845 Mon Sep 17 00:00:00 2001 From: Niek van der Maas Date: Sat, 15 Apr 2023 20:24:46 +0200 Subject: [PATCH 037/443] Support openblas --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b500a0b..5bd28b7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM python:3-buster ENV HOST 0.0.0.0 # Install the package -RUN pip install llama-cpp-python[server] +RUN apt update && apt install -y libopenblas-dev && LLAMA_OPENBLAS=1 pip install llama-cpp-python[server] # Run the server CMD python3 -m llama_cpp.server \ No newline at end of file From 89856ef00d377d0b63ce91fb3c5d184dcbfa9124 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 17:32:53 -0400 Subject: [PATCH 038/443] Bugfix: only eval new tokens --- llama_cpp/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b92801c..edd2eef 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -280,6 +280,7 @@ class Llama: if self.verbose: print("generate cache hit", file=sys.stderr) reset = False + tokens = tokens[len(self.tokens) :] ### if reset: self.reset() From e38485a66d0a92100815cccba3ba81439debdb6c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 20:27:55 -0400 Subject: [PATCH 039/443] Bump version. --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a0b6df3..aeb5579 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.33" +version = "0.1.34" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 1648f64..b0ff844 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.33", + version="0.1.34", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From b2a24bddacc7b10d1ba8a0dff1d8b5fae9bfbad3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Apr 2023 22:31:14 -0400 Subject: [PATCH 040/443] Update docs --- docs/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/index.md b/docs/index.md index 4055155..5424e26 100644 --- a/docs/index.md +++ b/docs/index.md @@ -104,10 +104,13 @@ python3 setup.py develop - create_completion - __call__ - create_chat_completion + - set_cache - token_bos - token_eos show_root_heading: true +::: llama_cpp.LlamaCache + ::: llama_cpp.llama_cpp options: show_if_no_docstring: true From 53d17ad0033e99d9ac5c3fb4855710383fb1f202 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 17 Apr 2023 14:45:28 +0200 Subject: [PATCH 041/443] Fixed end of text wrong type, and fix n_predict behaviour --- examples/low_level_api/common.py | 2 +- examples/low_level_api/low_level_api_chat_cpp.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/low_level_api/common.py 
b/examples/low_level_api/common.py index 58a5688..061ec3a 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -75,7 +75,7 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index a61a55e..d64ee8f 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -144,6 +144,7 @@ specified) expect poor results""", file=sys.stderr) # determine newline token self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) if (self.params.verbose_prompt): print(f""" @@ -203,16 +204,16 @@ n_keep = {self.params.n_keep} _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - def use_antiprompt(self): - return len(self.first_antiprompt) > 0 - def set_color(self, c): if (self.params.use_color): print(c, end="") + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.params.interactive: + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -313,7 +314,7 @@ n_keep = {self.params.n_keep} # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): if (not self.params.instruct): - for i in " [end of text]\n": + for i in self.llama_token_eot: yield i break From 3f68e950976eca364943d80ca515b0196ef5be13 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 01:29:27 -0400 Subject: [PATCH 042/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e95b655..315a95a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e95b6554b493e71a0275764342e09bd5784a7026 +Subproject commit 315a95a4d30db726fb7d244dd3b9e90a83fb1616 From 35abf89552c6167cec6b110fc6981585970147cd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 01:30:04 -0400 Subject: [PATCH 043/443] Add bindings for LoRA adapters. 
Closes #88 --- llama_cpp/llama_cpp.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 811f69a..cad9030 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -114,7 +114,9 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( + 4 +) # tok_embeddings.weight and output.weight are F16 # Functions @@ -175,6 +177,22 @@ _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +def llama_apply_lora_from_file( + ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int +) -> c_int: + return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +_lib.llama_apply_lora_from_file.restype = c_int + + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): From eb7f278cc645ad85ad641713d423bc8193016fd2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 01:43:44 -0400 Subject: [PATCH 044/443] Add lora_path parameter to Llama model --- llama_cpp/llama.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index edd2eef..931d0ff 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -39,6 +39,7 @@ class Llama: n_threads: Optional[int] = None, n_batch: int = 8, last_n_tokens_size: int = 64, + lora_path: Optional[str] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -57,6 +58,7 @@ class Llama: n_threads: Number of threads to use. If None, the number of threads is automatically determined. n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_path: Path to a LoRA file to apply to the model. verbose: Print verbose output to stderr. 
Raises: @@ -108,6 +110,17 @@ class Llama: self.model_path.encode("utf-8"), self.params ) + self.lora_path = None + if lora_path: + self.lora_path = lora_path + if llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.lora_path.encode("utf-8"), + self.model_path.encode("utf-8"), + llama_cpp.c_int(self.n_threads), + ): + raise RuntimeError(f"Failed to apply LoRA from path: {self.lora_path}") + if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) @@ -802,6 +815,7 @@ class Llama: last_n_tokens_size=self.last_n_tokens_size, n_batch=self.n_batch, n_threads=self.n_threads, + lora_path=self.lora_path, ) def __setstate__(self, state): @@ -819,6 +833,7 @@ class Llama: n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], + lora_path=state["lora_path"], verbose=state["verbose"], ) From b2d44aa6339c24ccdcda6922371cafb10ce92d40 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 02:22:35 -0400 Subject: [PATCH 045/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 315a95a..4274722 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 315a95a4d30db726fb7d244dd3b9e90a83fb1616 +Subproject commit 42747220b4cac548b6e3059b66b3e960b517cfa4 From 453e517fd54c5f2a882199629beb0f01002e0b40 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 10:20:46 -0400 Subject: [PATCH 046/443] Add seperate lora_base path for applying LoRA to quantized models using original unquantized model weights. --- llama_cpp/llama.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 931d0ff..5f09e4d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -39,6 +39,7 @@ class Llama: n_threads: Optional[int] = None, n_batch: int = 8, last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, lora_path: Optional[str] = None, verbose: bool = True, ): @@ -58,6 +59,7 @@ class Llama: n_threads: Number of threads to use. If None, the number of threads is automatically determined. n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. verbose: Print verbose output to stderr. @@ -110,16 +112,21 @@ class Llama: self.model_path.encode("utf-8"), self.params ) + self.lora_base = None self.lora_path = None if lora_path: + self.lora_base = lora_base + # Use lora_base if set otherwise revert to using model_path. 
+ lora_base = lora_base if lora_base is not None else model_path + self.lora_path = lora_path if llama_cpp.llama_apply_lora_from_file( self.ctx, - self.lora_path.encode("utf-8"), - self.model_path.encode("utf-8"), + lora_path.encode("utf-8"), + lora_base.encode("utf-8"), llama_cpp.c_int(self.n_threads), ): - raise RuntimeError(f"Failed to apply LoRA from path: {self.lora_path}") + raise RuntimeError(f"Failed to apply LoRA from lora path: {lora_path} to base path: {lora_base}") if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) @@ -815,6 +822,7 @@ class Llama: last_n_tokens_size=self.last_n_tokens_size, n_batch=self.n_batch, n_threads=self.n_threads, + lora_base=self.lora_base, lora_path=self.lora_path, ) @@ -833,6 +841,7 @@ class Llama: n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], + lora_base=state["lora_base"], lora_path=state["lora_path"], verbose=state["verbose"], ) From 95c0dc134ebef73af5c3b88b246085dcd8e51492 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 23:44:46 -0400 Subject: [PATCH 047/443] Update type signature to allow for null pointer to be passed. --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index cad9030..78d8e1f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -184,7 +184,7 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int + ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 0df4d69c205a9b9ca509854ae9840598b894259b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 23:45:25 -0400 Subject: [PATCH 048/443] If lora base is not set avoid re-loading the model by passing NULL --- llama_cpp/llama.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5f09e4d..ea9f0ff 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -112,21 +112,20 @@ class Llama: self.model_path.encode("utf-8"), self.params ) - self.lora_base = None - self.lora_path = None - if lora_path: - self.lora_base = lora_base - # Use lora_base if set otherwise revert to using model_path. 
- lora_base = lora_base if lora_base is not None else model_path - - self.lora_path = lora_path + self.lora_base = lora_base + self.lora_path = lora_path + if self.lora_path: if llama_cpp.llama_apply_lora_from_file( self.ctx, - lora_path.encode("utf-8"), - lora_base.encode("utf-8"), + llama_cpp.c_char_p(self.lora_path.encode("utf-8")), + llama_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else llama_cpp.c_char_p(0), llama_cpp.c_int(self.n_threads), ): - raise RuntimeError(f"Failed to apply LoRA from lora path: {lora_path} to base path: {lora_base}") + raise RuntimeError( + f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" + ) if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) From 207ebbc8dc2e1fe94fa329ac5393d093d30f449e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 19 Apr 2023 14:02:11 -0400 Subject: [PATCH 049/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4274722..884e7d7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 42747220b4cac548b6e3059b66b3e960b517cfa4 +Subproject commit 884e7d7a2bfd7325b107442d6758983f5886ed3d From e4647c75ec49e21fa2146844c6b91faba58c6699 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 19 Apr 2023 15:57:46 -0400 Subject: [PATCH 050/443] Add use_mmap flag to server --- llama_cpp/server/__main__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 48481c6..b2ec4de 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -29,9 +29,10 @@ class Settings(BaseSettings): model: str n_ctx: int = 2048 n_batch: int = 8 - n_threads: int = ((os.cpu_count() or 2) // 2) or 1 + n_threads: int = max((os.cpu_count() or 2) // 2, 1) f16_kv: bool = True use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
+ use_mmap: bool = True embedding: bool = True last_n_tokens_size: int = 64 logits_all: bool = False @@ -54,6 +55,7 @@ llama = llama_cpp.Llama( settings.model, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, embedding=settings.embedding, logits_all=settings.logits_all, n_threads=settings.n_threads, From 3d290623f5473929aaa59d80f9cb19d1c84c32bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 20 Apr 2023 01:08:15 -0400 Subject: [PATCH 051/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 884e7d7..02d6988 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 884e7d7a2bfd7325b107442d6758983f5886ed3d +Subproject commit 02d6988121510c067e06d498a273a351a888f5b9 From 207adbdf1319302bb7bd6e137ca34d4605b36ec0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 20 Apr 2023 01:48:24 -0400 Subject: [PATCH 052/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aeb5579..80a354c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.34" +version = "0.1.35" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index b0ff844..3a7ee1c 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.34", + version="0.1.35", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From ba3959eafd38080f3bf3028746406f350a8ef793 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 20 Apr 2023 05:15:31 -0400 Subject: [PATCH 053/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 02d6988..c8c2c52 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 02d6988121510c067e06d498a273a351a888f5b9 +Subproject commit c8c2c524827be8fd681a63f0e5a697b0bf4c587b From 1eb130a6b2445f4f9a41424362a64c26f3424529 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 21 Apr 2023 17:40:27 -0400 Subject: [PATCH 054/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 9 ++++++--- vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 78d8e1f..97c6565 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors +LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors # Functions @@ -169,11 +171,12 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success -def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype) +# nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given +def llama_model_quantize(fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int) -> c_int: + return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] _lib.llama_model_quantize.restype = c_int diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c8c2c52..50cb666 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c8c2c524827be8fd681a63f0e5a697b0bf4c587b +Subproject commit 50cb666b8a2e35a49b08c0f6bc81138c8f6f2ac1 From 643b73e15571ca005f54f81e22120707931408b9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 21 Apr 2023 19:38:54 -0400 Subject: [PATCH 055/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 80a354c..c47ab7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.35" +version = "0.1.36" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 3a7ee1c..624e12e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.35", + version="0.1.36", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From e99caedbbd59dae4c7d913fd3d1fb6a4998cdb7f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 22 Apr 2023 19:50:28 -0400 Subject: [PATCH 056/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 39 +++++++++++++++++++++++++++++++++++++-- vendor/llama.cpp | 2 +- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 97c6565..2ffc4c5 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -172,7 +172,9 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given -def llama_model_quantize(fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int) -> c_int: +def llama_model_quantize( + fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int +) -> c_int: return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) @@ -187,7 +189,10 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int + ctx: llama_context_p, + path_lora: ctypes.c_char_p, + path_base_model: ctypes.c_char_p, + n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -235,6 +240,36 @@ _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, _lib.llama_set_kv_cache.restype = None +# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return _lib.llama_get_state_size(ctx) + + +_lib.llama_get_state_size.argtypes = [llama_context_p] +_lib.llama_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: + return _lib.llama_copy_state_data(ctx, dest) + + +_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: + return _lib.llama_set_state_data(ctx, src) + + +_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 50cb666..0e018fe 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 50cb666b8a2e35a49b08c0f6bc81138c8f6f2ac1 +Subproject commit 0e018fe008eacebdbcfa2d61b6c988c245c961cd From 723059959302d109f3468f2426b442af1469542e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 23 Apr 2023 14:53:17 -0400 Subject: [PATCH 057/443] Disable mmap when applying lora weights. 
Closes #107 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ea9f0ff..70dcea9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -79,7 +79,7 @@ class Llama: self.params.f16_kv = f16_kv self.params.logits_all = logits_all self.params.vocab_only = vocab_only - self.params.use_mmap = use_mmap + self.params.use_mmap = use_mmap if lora_path is None else False self.params.use_mlock = use_mlock self.params.embedding = embedding From aa12d8a81f5b2cf6d9b7a037fa69bdec6ca036b1 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Sun, 23 Apr 2023 20:56:40 -0400 Subject: [PATCH 058/443] Update llama.py update n_batch default to 512 to match upstream llama.cpp --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ea9f0ff..a414a1c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -37,7 +37,7 @@ class Llama: use_mlock: bool = False, embedding: bool = False, n_threads: Optional[int] = None, - n_batch: int = 8, + n_batch: int = 512, last_n_tokens_size: int = 64, lora_base: Optional[str] = None, lora_path: Optional[str] = None, From 8476b325f127d66477424f6767c3330fa520728e Mon Sep 17 00:00:00 2001 From: Niek van der Maas Date: Mon, 24 Apr 2023 09:54:38 +0200 Subject: [PATCH 059/443] Change to bullseye --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5bd28b7..ade4ac9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3-buster +FROM python:3-bullseye # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 From 02cf88131781f4aec6754be1468095ba2c9ea730 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 09:30:10 -0400 Subject: [PATCH 060/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 30 ------------------------------ vendor/llama.cpp | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2ffc4c5..2b5af66 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -201,25 +201,6 @@ _lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, _lib.llama_apply_lora_from_file.restype = c_int -# Returns the KV cache that will contain the context for the -# ongoing prediction with the model. 
-def llama_get_kv_cache(ctx: llama_context_p): - return _lib.llama_get_kv_cache(ctx) - - -_lib.llama_get_kv_cache.argtypes = [llama_context_p] -_lib.llama_get_kv_cache.restype = POINTER(c_uint8) - - -# Returns the size of the KV cache -def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: - return _lib.llama_get_kv_cache_size(ctx) - - -_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_size.restype = c_size_t - - # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -229,17 +210,6 @@ _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int -# Sets the KV cache containing the current context for the model -def llama_set_kv_cache( - ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int -): - return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) - - -_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] -_lib.llama_set_kv_cache.restype = None - - # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0e018fe..c4fe84f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 0e018fe008eacebdbcfa2d61b6c988c245c961cd +Subproject commit c4fe84fb0d28851a5c10e5a633f82ae2ba3b7fae From 86f8e5ad9162a57a72d3af598e477d0971e89eb7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 15:47:54 -0400 Subject: [PATCH 061/443] Refactor internal state for Llama class --- llama_cpp/llama.py | 63 +++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 70dcea9..f7a6e9e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -84,16 +84,9 @@ class Llama: self.params.embedding = embedding self.last_n_tokens_size = last_n_tokens_size - self.last_n_tokens_data = deque( - [llama_cpp.llama_token(0)] * self.last_n_tokens_size, - maxlen=self.last_n_tokens_size, - ) - self.tokens_consumed = 0 - self.tokens: List[llama_cpp.llama_token] = [] self.n_batch = min(n_ctx, n_batch) - self.n_tokens = 0 - self.n_past = 0 - self.all_logits: List[List[float]] = [] # TODO: Use an array instead of a list. + self.eval_tokens: deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) + self.eval_logits: deque[List[float]] = deque(maxlen=n_ctx) ### HACK: This is a hack to work around the fact that the llama.cpp API does not yet support ### saving and restoring state, this allows us to continue a completion if the last @@ -181,14 +174,8 @@ class Llama: def reset(self): """Reset the model state.""" - self.last_n_tokens_data.extend( - [llama_cpp.llama_token(0)] * self.last_n_tokens_size - ) - self.tokens_consumed = 0 - self.tokens.clear() - self.n_tokens = 0 - self.n_past = 0 - self.all_logits.clear() + self.eval_tokens.clear() + self.eval_logits.clear() def eval(self, tokens: Sequence[llama_cpp.llama_token]): """Evaluate a list of tokens. 
@@ -200,32 +187,25 @@ class Llama: n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - self.n_past = min(n_ctx - len(batch), self.tokens_consumed) - self.n_tokens = len(batch) + n_past = min(n_ctx - len(batch), len(self.eval_tokens)) + n_tokens = len(batch) return_code = llama_cpp.llama_eval( ctx=self.ctx, tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(self.n_tokens), - n_past=llama_cpp.c_int(self.n_past), + n_tokens=llama_cpp.c_int(n_tokens), + n_past=llama_cpp.c_int(n_past), n_threads=llama_cpp.c_int(self.n_threads), ) if int(return_code) != 0: raise RuntimeError(f"llama_eval returned {return_code}") - self.tokens.extend(batch) - self.last_n_tokens_data.extend(batch) - self.tokens_consumed += len(batch) + self.eval_tokens.extend(batch) if self.params.logits_all: - self.all_logits.extend(self._logits()) - - def _logits(self) -> List[List[float]]: - """Return the logits from the last call to llama_eval.""" - assert self.ctx is not None - n_vocab = llama_cpp.llama_n_vocab(self.ctx) - cols = int(n_vocab) - rows = self.n_tokens if self.params.logits_all else 1 - logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)] - return logits + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + cols = int(n_vocab) + rows = n_tokens + logits_view = llama_cpp.llama_get_logits(self.ctx) + logits = [[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)] + self.eval_logits.extend(logits) def sample( self, @@ -246,10 +226,13 @@ class Llama: The sampled token. """ assert self.ctx is not None + last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + 0, self.last_n_tokens_size - len(self.eval_tokens) + ) + list(self.eval_tokens)[-self.last_n_tokens_size :] return llama_cpp.llama_sample_top_p_top_k( ctx=self.ctx, last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( - *self.last_n_tokens_data + *last_n_tokens_data ), last_n_tokens_size=llama_cpp.c_int(self.last_n_tokens_size), top_k=llama_cpp.c_int(top_k), @@ -293,13 +276,13 @@ class Llama: if ( reset and self._cache - and len(self.tokens) > 0 - and self.tokens == tokens[: len(self.tokens)] + and len(self.eval_tokens) > 0 + and self.eval_tokens == tokens[: len(self.eval_tokens)] ): if self.verbose: print("generate cache hit", file=sys.stderr) reset = False - tokens = tokens[len(self.tokens) :] + tokens = tokens[len(self.eval_tokens) :] ### if reset: self.reset() @@ -537,7 +520,7 @@ class Llama: ] all_logprobs = [ [Llama.logit_to_logprob(logit) for logit in row] - for row in self.all_logits + for row in self.eval_logits ] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs From 280a047dd66cf6356690de0e881ddd7a9c88a51a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 15:52:24 -0400 Subject: [PATCH 062/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c4fe84f..8a0f867 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c4fe84fb0d28851a5c10e5a633f82ae2ba3b7fae +Subproject commit 8a0f8673ba1cdc6aa6df27a9fbc698431ca70e8d From c4c332fc51f9dd0fbad813997b8dab4213812b87 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 17:42:09 -0400 Subject: [PATCH 063/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8a0f867..54bb60e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8a0f8673ba1cdc6aa6df27a9fbc698431ca70e8d +Subproject commit 54bb60e26858be251a0eb3cb70f80322aff804a0 From 197cf80601a3ac4efe2ec3dfe17cdc1397b68975 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 17:51:25 -0400 Subject: [PATCH 064/443] Add save/load state api for Llama class --- llama_cpp/llama.py | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f7a6e9e..c857bbe 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,7 @@ import uuid import time import math import multiprocessing -from typing import List, Optional, Union, Generator, Sequence, Iterator +from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque from collections import deque from . import llama_cpp @@ -20,6 +20,18 @@ class LlamaCache: pass +class LlamaState: + def __init__( + self, + eval_tokens: Deque[llama_cpp.llama_token], + eval_logits: Deque[List[float]], + llama_state, + ): + self.eval_tokens = eval_tokens + self.eval_logits = eval_logits + self.llama_state = llama_state + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -85,8 +97,8 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.eval_tokens: deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: deque[List[float]] = deque(maxlen=n_ctx) + self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) + self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx) ### HACK: This is a hack to work around the fact that the llama.cpp API does not yet support ### saving and restoring state, this allows us to continue a completion if the last @@ -204,7 +216,10 @@ class Llama: cols = int(n_vocab) rows = n_tokens logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)] + logits = [ + [logits_view[i * cols + j] for j in range(cols)] + for i in range(rows) + ] self.eval_logits.extend(logits) def sample( @@ -828,6 +843,26 @@ class Llama: verbose=state["verbose"], ) + def save_state(self) -> LlamaState: + assert self.ctx is not None + state_size = llama_cpp.llama_get_state_size(self.ctx) + llama_state = (llama_cpp.c_uint8 * int(state_size))() + if llama_cpp.llama_copy_state_data(self.ctx, llama_state) != state_size: + raise RuntimeError("Failed to copy llama state data") + return LlamaState( + eval_tokens=self.eval_tokens.copy(), + eval_logits=self.eval_logits.copy(), + llama_state=llama_state, + ) + + def load_state(self, state: LlamaState) -> None: + assert self.ctx is not None + self.eval_tokens = state.eval_tokens.copy() + self.eval_logits = state.eval_logits.copy() + state_size = llama_cpp.llama_get_state_size(self.ctx) + if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: + raise RuntimeError("Failed to set llama state data") + @staticmethod def token_eos() -> llama_cpp.llama_token: """Return the end-of-sequence token.""" From cbe95bbb75ba72cbb39308ee645d3bf1e5507a86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 19:54:41 -0400 Subject: [PATCH 065/443] Add cache implementation using llama state --- llama_cpp/llama.py | 64 +++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git 
a/llama_cpp/llama.py b/llama_cpp/llama.py index c2d9d10..0a69b2c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -12,12 +12,22 @@ from .llama_types import * class LlamaCache: - """Cache for a llama.cpp model. + """Cache for a llama.cpp model.""" - NOTE: This implementation currently only tells the Llama class to avoid reprocessing bytes and continue from the last - completion. It does not actually cache the results.""" + def __init__(self): + self.cache_state: Dict[Sequence[llama_cpp.llama_token], "LlamaState"] = dict() - pass + def __getitem__( + self, key: Sequence[llama_cpp.llama_token] + ) -> Optional["LlamaState"]: + return self.cache_state.get(tuple(key), None) + + def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: + return tuple(key) in self.cache_state + + def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): + self.cache_state = dict() # NOTE: Currently limit to one cache entry. + self.cache_state[tuple(key)] = value class LlamaState: @@ -100,13 +110,7 @@ class Llama: self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx) - ### HACK: This is a hack to work around the fact that the llama.cpp API does not yet support - ### saving and restoring state, this allows us to continue a completion if the last - ### completion_bytes is a prefix to the prompt passed in. However this is actually incorrect - ### because it does not take into account stop tokens which have been processed by the model. - self._completion_bytes: List[bytes] = [] - self._cache: Optional[LlamaCache] = None - ### + self.cache: Optional[LlamaCache] = None self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -182,7 +186,7 @@ class Llama: Args: cache: The cache to set. """ - self._cache = cache + self.cache = cache def reset(self): """Reset the model state.""" @@ -287,10 +291,9 @@ class Llama: The generated tokens. 
""" assert self.ctx is not None - ### HACK + if ( reset - and self._cache and len(self.eval_tokens) > 0 and self.eval_tokens == tokens[: len(self.eval_tokens)] ): @@ -298,7 +301,7 @@ class Llama: print("generate cache hit", file=sys.stderr) reset = False tokens = tokens[len(self.eval_tokens) :] - ### + if reset: self.reset() while True: @@ -415,20 +418,10 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - ### HACK - reset: bool = True - _prompt: bytes = prompt.encode("utf-8") - _completion: bytes = b"".join(self._completion_bytes) - if len(_completion) and self._cache and _prompt.startswith(_completion): + if self.cache and prompt_tokens in self.cache: if self.verbose: - print("completion cache hit", file=sys.stderr) - reset = False - _prompt = _prompt[len(_completion) :] - prompt_tokens = self.tokenize(b" " + _prompt) - self._completion_bytes.append(_prompt) - else: - self._completion_bytes = [prompt.encode("utf-8")] - ### + print("cache hit", file=sys.stderr) + self.load_state(self.cache[prompt_tokens]) finish_reason = "length" for token in self.generate( @@ -437,12 +430,16 @@ class Llama: top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty, - reset=reset, ): if token == llama_cpp.llama_token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" break + + if self.cache and len(completion_tokens) == 0: + if prompt_tokens not in self.cache: + self.cache[prompt_tokens] = self.save_state() + completion_tokens.append(token) all_text = self.detokenize(completion_tokens) @@ -467,9 +464,6 @@ class Llama: break text = all_text[: len(all_text) - longest] returned_characters += len(text[start:]) - ### HACK - self._completion_bytes.append(text[start:]) - ### yield { "id": completion_id, "object": "text_completion", @@ -491,9 +485,6 @@ class Llama: break if stream: - ### HACK - self._completion_bytes.append(text[returned_characters:]) - ### yield { "id": completion_id, "object": "text_completion", @@ -510,9 +501,6 @@ class Llama: } return - ### HACK - self._completion_bytes.append(text) - ### text_str = text.decode("utf-8") if echo: From b75fa96bf7995e0f252ee7295262fb745f3d8290 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 19:56:57 -0400 Subject: [PATCH 066/443] Update docs --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index 5424e26..c36adff 100644 --- a/docs/index.md +++ b/docs/index.md @@ -105,12 +105,16 @@ python3 setup.py develop - __call__ - create_chat_completion - set_cache + - save_state + - load_state - token_bos - token_eos show_root_heading: true ::: llama_cpp.LlamaCache +::: llama_cpp.LlamaState + ::: llama_cpp.llama_cpp options: show_if_no_docstring: true From d484c5634eed2b65cd6de2b3ff1e606031c1f67b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 22:18:54 -0400 Subject: [PATCH 067/443] Bugfix: Check cache keys as prefix to prompt tokens --- llama_cpp/llama.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0a69b2c..487f44d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,7 @@ import uuid import time import math import multiprocessing -from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque +from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple from collections import deque from . 
import llama_cpp @@ -15,15 +15,34 @@ class LlamaCache: """Cache for a llama.cpp model.""" def __init__(self): - self.cache_state: Dict[Sequence[llama_cpp.llama_token], "LlamaState"] = dict() + self.cache_state: Dict[Tuple[llama_cpp.llama_token, ...], "LlamaState"] = dict() + + def _sorted_keys(self) -> List[Tuple[llama_cpp.llama_token, ...]]: + return [ + key + for _, key in sorted( + ((len(key), key) for key in self.cache_state.keys()), reverse=True + ) + ] + + def _find_key( + self, key: Tuple[llama_cpp.llama_token, ...] + ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: + for k in self._sorted_keys(): + if key[: len(k)] == k: + return k + return None def __getitem__( self, key: Sequence[llama_cpp.llama_token] ) -> Optional["LlamaState"]: - return self.cache_state.get(tuple(key), None) + _key = self._find_key(tuple(key)) + if _key is None: + return None + return self.cache_state[_key] def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: - return tuple(key) in self.cache_state + return self._find_key(tuple(key)) is not None def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): self.cache_state = dict() # NOTE: Currently limit to one cache entry. @@ -295,7 +314,7 @@ class Llama: if ( reset and len(self.eval_tokens) > 0 - and self.eval_tokens == tokens[: len(self.eval_tokens)] + and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)]) ): if self.verbose: print("generate cache hit", file=sys.stderr) @@ -438,6 +457,8 @@ class Llama: if self.cache and len(completion_tokens) == 0: if prompt_tokens not in self.cache: + if self.verbose: + print("cache miss", file=sys.stderr) self.cache[prompt_tokens] = self.save_state() completion_tokens.append(token) From 9dddb3a607b522b18d254d3bf77e391f290e819e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 00:19:44 -0400 Subject: [PATCH 068/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c47ab7a..773d0c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.36" +version = "0.1.37" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 624e12e..ed3b48e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.36", + version="0.1.37", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 848c83dfd0e116b94d595d9303cac5e9e444669a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 01:36:37 -0400 Subject: [PATCH 069/443] Add FORCE_CMAKE option --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27e06ac..bda2388 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,11 @@ cmake_minimum_required(VERSION 3.4...3.22) project(llama_cpp) -if (UNIX) +option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) + +set(FORCE_CMAKE $ENV{FORCE_CMAKE}) + +if (UNIX AND NOT FORCE_CMAKE) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so COMMAND make libllama.so From 996e31d861ce5c8cfbefe6af52a3da25cf484454 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 01:37:07 -0400 Subject: [PATCH 070/443] Bump version --- 
pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 773d0c2..3e416d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.37" +version = "0.1.38" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index ed3b48e..20db9a7 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.37", + version="0.1.38", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From cc706fb94448f7d9c0db89eebd7188d738c6831d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 09:00:53 -0400 Subject: [PATCH 071/443] Add ctx check and re-order __init__. Closes #112 --- llama_cpp/llama.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 487f44d..df9a491 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -133,6 +133,9 @@ class Llama: self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) + self.lora_base = lora_base + self.lora_path = lora_path + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") @@ -140,8 +143,8 @@ class Llama: self.model_path.encode("utf-8"), self.params ) - self.lora_base = lora_base - self.lora_path = lora_path + assert self.ctx is not None + if self.lora_path: if llama_cpp.llama_apply_lora_from_file( self.ctx, From 3cab3ef4cb1ae39ad19ffe2b58cdf6671dd82e43 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 09:11:32 -0400 Subject: [PATCH 072/443] Update n_batch for server --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index b2ec4de..af6cc38 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -28,7 +28,7 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): model: str n_ctx: int = 2048 - n_batch: int = 8 + n_batch: int = 512 n_threads: int = max((os.cpu_count() or 2) // 2, 1) f16_kv: bool = True use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
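
Taken together, the patches up to this point add LoRA adapter loading (`lora_path` and `lora_base`), a 512-token default `n_batch`, a prompt-keyed state cache, and `save_state`/`load_state` on the high-level `Llama` class. The sketch below is not part of the patch series; it only illustrates, with placeholder model and adapter paths, how those pieces might be combined from user code:

```python
from llama_cpp import Llama, LlamaCache

# Placeholder paths: a quantized ggml model, its f16 base, and a LoRA adapter.
llama = Llama(
    model_path="./models/7B/ggml-model-q4_0.bin",
    n_batch=512,                                 # new default, matches upstream llama.cpp
    lora_base="./models/7B/ggml-model-f16.bin",  # optional unquantized base for the adapter
    lora_path="./lora/ggml-adapter-model.bin",   # note: mmap is disabled while a LoRA is applied
)

# Cache context state keyed by prompt tokens, so a later prompt that shares a
# prefix with an earlier one does not re-evaluate the shared tokens.
llama.set_cache(LlamaCache())

output = llama(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=48,
    stop=["Q:", "\n"],
)
print(output["choices"][0]["text"])

# The same context state can also be captured and restored explicitly.
state = llama.save_state()
# ... run more completions ...
llama.load_state(state)
```

If `lora_base` is omitted, the adapter is applied on top of the loaded model itself (a NULL base path is passed through to `llama_apply_lora_from_file`).
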
From cbd26fdcc116dc692308f2d262083dfd1ddaa142 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 19:03:41 -0400 Subject: [PATCH 073/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 1 + vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2b5af66..1097d74 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -119,6 +119,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTYL_Q8_0 = ctypes.c_int(7) # except 1d tensors # Functions diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 54bb60e..4afcc37 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 54bb60e26858be251a0eb3cb70f80322aff804a0 +Subproject commit 4afcc378698e057fcde64e23eb664e5af8dd6956 From c4a8491d42b3b93330408afc3cc2af31ae2fecb1 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 26 Apr 2023 14:37:06 +0200 Subject: [PATCH 074/443] Fix decode errors permanently --- examples/low_level_api/low_level_api_chat_cpp.py | 9 ++++++--- examples/low_level_api/low_level_api_llama_cpp.py | 2 +- llama_cpp/llama.py | 12 ++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index d64ee8f..4a7cfc1 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): @@ -342,7 +342,7 @@ n_keep = {self.params.n_keep} # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # write input def input(self, prompt: str): @@ -356,7 +356,10 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + try: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + except UnicodeDecodeError: + pass # read user input def read_input(self): diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index b048c0a..4fb5a03 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -70,7 +70,7 @@ while remaining_tokens > 0: if not input_noecho: for id in embd: print( - llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), end="", flush=True, ) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index edd2eef..a6e7ae3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -109,7 +109,7 @@ class Llama: ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + 
print(llama_cpp.llama_print_system_info().decode("utf-8", errors="ignore"), file=sys.stderr) def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: """Tokenize a string. @@ -460,7 +460,7 @@ class Llama: "model": self.model_path, "choices": [ { - "text": text[start:].decode("utf-8"), + "text": text[start:].decode("utf-8", errors="ignore"), "index": 0, "logprobs": None, "finish_reason": None, @@ -484,7 +484,7 @@ class Llama: "model": self.model_path, "choices": [ { - "text": text[returned_characters:].decode("utf-8"), + "text": text[returned_characters:].decode("utf-8", errors="ignore"), "index": 0, "logprobs": None, "finish_reason": finish_reason, @@ -496,7 +496,7 @@ class Llama: ### HACK self._completion_bytes.append(text) ### - text_str = text.decode("utf-8") + text_str = text.decode("utf-8", errors="ignore") if echo: text_str = prompt + text_str @@ -514,7 +514,7 @@ class Llama: all_tokens = prompt_tokens + completion_tokens all_token_strs = [ - self.detokenize([token]).decode("utf-8") for token in all_tokens + self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] all_logprobs = [ [Llama.logit_to_logprob(logit) for logit in row] @@ -533,7 +533,7 @@ class Llama: ) token_logprobs.append(sorted_logprobs[int(token)][0]) top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob + self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) From 3c130f00ca65943fc4ac3db7d11cf9ca83cd5c2a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 26 Apr 2023 14:38:53 +0200 Subject: [PATCH 075/443] Remove try catch from chat --- examples/low_level_api/low_level_api_chat_cpp.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 4a7cfc1..c383bf6 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -356,10 +356,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - try: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") - except UnicodeDecodeError: - pass + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # read user input def read_input(self): From 5f81400fcb2898e9eb6b13f32dc066052d7efeef Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 26 Apr 2023 14:45:51 +0200 Subject: [PATCH 076/443] Also ignore errors on input prompts --- examples/low_level_api/low_level_api_chat_cpp.py | 2 +- llama_cpp/llama.py | 6 +++--- tests/test_llama.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index c383bf6..90b2fcb 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -201,7 +201,7 @@ n_keep = {self.params.n_keep} # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() - _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] def set_color(self, c): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 
f442648..41e8c0a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -358,7 +358,7 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - tokens = self.tokenize(input.encode("utf-8")) + tokens = self.tokenize(input.encode("utf-8", errors="ignore")) self.reset() self.eval(tokens) n_tokens = len(tokens) @@ -416,7 +416,7 @@ class Llama: completion_tokens: List[llama_cpp.llama_token] = [] # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[llama_cpp.llama_token] = self.tokenize( - b" " + prompt.encode("utf-8") + b" " + prompt.encode("utf-8", errors="ignore") ) text: bytes = b"" returned_characters: int = 0 @@ -431,7 +431,7 @@ class Llama: ) if stop != []: - stop_sequences = [s.encode("utf-8") for s in stop] + stop_sequences = [s.encode("utf-8", errors="ignore") for s in stop] else: stop_sequences = [] diff --git a/tests/test_llama.py b/tests/test_llama.py index 6a50256..4dab687 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -24,7 +24,7 @@ def test_llama_patch(monkeypatch): monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) output_text = " jumps over the lazy dog." - output_tokens = llama.tokenize(output_text.encode("utf-8")) + output_tokens = llama.tokenize(output_text.encode("utf-8", errors="ignore")) token_eos = llama.token_eos() n = 0 From 9339929f56ca71adb97930679c710a2458f877bd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 26 Apr 2023 20:00:54 -0400 Subject: [PATCH 077/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 8 ++++++++ vendor/llama.cpp | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1097d74..7ec0418 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -120,6 +120,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q8_0 = ctypes.c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTYL_Q5_0 = ctypes.c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTYL_Q5_1 = ctypes.c_int(9) # except 1d tensors # Functions @@ -210,6 +212,12 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int +# Sets the current rng seed. 
+def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return _lib.llama_set_rng_seed(ctx, seed) + +_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +_lib.llama_set_rng_seed.restype = None # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4afcc37..0b2da20 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4afcc378698e057fcde64e23eb664e5af8dd6956 +Subproject commit 0b2da20538d01926b77ea237dd1c930c4d20b686 From c39547a986540d1152493db45ed461dde04f0ffa Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 12:50:30 +0200 Subject: [PATCH 078/443] Detect multi-byte responses and wait --- examples/low_level_api/low_level_api_chat_cpp.py | 2 +- llama_cpp/llama.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 90b2fcb..6fced65 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 41e8c0a..630af18 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -159,7 +159,7 @@ class Llama: ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8", errors="ignore"), file=sys.stderr) + print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: """Tokenize a string. 
@@ -446,6 +446,7 @@ class Llama: self.load_state(self.cache[prompt_tokens]) finish_reason = "length" + multibyte_fix = 0 for token in self.generate( prompt_tokens, top_k=top_k, @@ -458,6 +459,12 @@ class Llama: finish_reason = "stop" break + # Contains multi-byte UTF8 + for num,pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if (pattern & token == pattern): + multibyte_fix = num + if self.cache and len(completion_tokens) == 0: if prompt_tokens not in self.cache: if self.verbose: @@ -466,6 +473,11 @@ class Llama: completion_tokens.append(token) + # Stop incomplete bytes from passing + if (multibyte_fix > 0): + multibyte_fix -= 1 + continue + all_text = self.detokenize(completion_tokens) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: From 3a987470261b26f7a005b784863b282645326dc6 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 12:54:28 +0200 Subject: [PATCH 079/443] One day, i'll fix off by 1 errors permanently too --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 630af18..5adeaf8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -463,7 +463,7 @@ class Llama: for num,pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check if (pattern & token == pattern): - multibyte_fix = num + multibyte_fix = num - 1 if self.cache and len(completion_tokens) == 0: if prompt_tokens not in self.cache: From eed61289b68903ad9ca01f85976e9ababbbb1291 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 13:16:18 +0200 Subject: [PATCH 080/443] Dont detect off tokens, detect off detokenized utf8 --- llama_cpp/llama.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5adeaf8..92715b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -459,12 +459,6 @@ class Llama: finish_reason = "stop" break - # Contains multi-byte UTF8 - for num,pattern in [(2, 192), (3, 224), (4, 240)]: - # Bitwise AND check - if (pattern & token == pattern): - multibyte_fix = num - 1 - if self.cache and len(completion_tokens) == 0: if prompt_tokens not in self.cache: if self.verbose: @@ -473,12 +467,22 @@ class Llama: completion_tokens.append(token) + all_text = self.detokenize(completion_tokens) + + # Contains multi-byte UTF8 + for k,char in enumerate(all_text[-3:]): + k = 3 - k + char = int.from_bytes(char, "big") + for num,pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if (num > k and pattern & char == pattern): + multibyte_fix = num - k + # Stop incomplete bytes from passing if (multibyte_fix > 0): multibyte_fix -= 1 continue - all_text = self.detokenize(completion_tokens) any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: first_stop = any_stop[0] From b7d14efc8b7b62d97ed66694b0dca0e1e3b3b2f6 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 13:20:31 +0200 Subject: [PATCH 081/443] Python weirdness --- llama_cpp/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 92715b5..fe540f9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -472,7 +472,6 @@ class Llama: # Contains multi-byte UTF8 for k,char in enumerate(all_text[-3:]): k = 3 - k - char = int.from_bytes(char, "big") for num,pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check if (num > k and pattern & char == pattern): From ea0faabae18780daa5289729ae20bb172410122a Mon Sep 17 00:00:00 2001 From: Andrei 
Betlen Date: Fri, 28 Apr 2023 15:32:43 -0400 Subject: [PATCH 082/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 41 +++++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7ec0418..c081935 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -212,13 +212,16 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int + # Sets the current rng seed. def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) + _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None + # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) @@ -249,6 +252,44 @@ _lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] _lib.llama_set_state_data.restype = c_size_t +# Save/load session file +def llama_load_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens_out, + n_token_capacity: c_size_t, + n_token_count_out, +) -> c_size_t: + return _lib.llama_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.llama_load_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, + POINTER(c_size_t), +] +_lib.llama_load_session_file.restype = c_size_t + + +def llama_save_session_file( + ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t +) -> c_size_t: + return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.llama_save_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, +] +_lib.llama_save_session_file.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. 
# tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 0b2da20..7f15c5c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 0b2da20538d01926b77ea237dd1c930c4d20b686 +Subproject commit 7f15c5c477d9933689a9d1c40794483e350c2f19 From 5423d047c7e4c22b321e6f4df7ca87f08ce361a7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 28 Apr 2023 15:33:08 -0400 Subject: [PATCH 083/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3e416d0..798fcaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.38" +version = "0.1.39" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 20db9a7..19693a1 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.38", + version="0.1.39", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From e00beb13b5c5fe98f43709afadc48a0331725391 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 28 Apr 2023 17:08:18 -0400 Subject: [PATCH 084/443] Update README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 2c8c0a5..f1d9bd2 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,12 @@ Install from PyPI: pip install llama-cpp-python ``` +The above command will attempt to install the package and build `llama.cpp` from source. +This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. + +This method defaults to using `make` to build `llama.cpp` on Linux / MacOS and `cmake` on Windows. +You can force the use of `cmake` on Linux / MacOS by setting the `FORCE_CMAKE=1` environment variable before installing.
+ ## High-level API ```python From 523825e91d063119e50e1b813fbe971ad7efd13a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 28 Apr 2023 17:12:03 -0400 Subject: [PATCH 085/443] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1d9bd2..906d055 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This package provides: ## Installation -Install from PyPI: +Install from PyPI (requires a c compiler): ```bash pip install llama-cpp-python From 18a0c10032ef793b67bb8ea9e4ca9e3aaa791595 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Sat, 29 Apr 2023 12:19:22 +0200 Subject: [PATCH 086/443] Remove excessive errors="ignore" and add utf8 test --- llama_cpp/llama.py | 6 +++--- tests/test_llama.py | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fe540f9..4e3c3aa 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -358,7 +358,7 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - tokens = self.tokenize(input.encode("utf-8", errors="ignore")) + tokens = self.tokenize(input.encode("utf-8")) self.reset() self.eval(tokens) n_tokens = len(tokens) @@ -416,7 +416,7 @@ class Llama: completion_tokens: List[llama_cpp.llama_token] = [] # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[llama_cpp.llama_token] = self.tokenize( - b" " + prompt.encode("utf-8", errors="ignore") + b" " + prompt.encode("utf-8") ) text: bytes = b"" returned_characters: int = 0 @@ -431,7 +431,7 @@ class Llama: ) if stop != []: - stop_sequences = [s.encode("utf-8", errors="ignore") for s in stop] + stop_sequences = [s.encode("utf-8") for s in stop] else: stop_sequences = [] diff --git a/tests/test_llama.py b/tests/test_llama.py index 4dab687..4727d90 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -24,7 +24,7 @@ def test_llama_patch(monkeypatch): monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) output_text = " jumps over the lazy dog." 
- output_tokens = llama.tokenize(output_text.encode("utf-8", errors="ignore")) + output_tokens = llama.tokenize(output_text.encode("utf-8")) token_eos = llama.token_eos() n = 0 @@ -93,4 +93,38 @@ def test_llama_pickle(): text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text \ No newline at end of file + assert llama.detokenize(llama.tokenize(text)) == text + +def test_utf8(monkeypatch): + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + + ## Set up mock function + def mock_eval(*args, **kwargs): + return 0 + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + + output_text = "😀" + output_tokens = llama.tokenize(output_text.encode("utf-8")) + token_eos = llama.token_eos() + n = 0 + + def mock_sample(*args, **kwargs): + nonlocal n + if n < len(output_tokens): + n += 1 + return output_tokens[n - 1] + else: + return token_eos + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample) + + ## Test basic completion with utf8 multibyte + n = 0 # reset + completion = llama.create_completion("", max_tokens=4) + assert completion["choices"][0]["text"] == output_text + + ## Test basic completion with incomplete utf8 multibyte + n = 0 # reset + completion = llama.create_completion("", max_tokens=1) + assert completion["choices"][0]["text"] == "" From 468377b0e210c205944edafb0325779e87347581 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Fri, 28 Apr 2023 22:43:37 -0700 Subject: [PATCH 087/443] llama_cpp server: app is now importable, still runnable as a module --- llama_cpp/server/__init__.py | 0 llama_cpp/server/__main__.py | 281 ++--------------------------------- llama_cpp/server/app.py | 266 +++++++++++++++++++++++++++++++++ 3 files changed, 279 insertions(+), 268 deletions(-) create mode 100644 llama_cpp/server/__init__.py create mode 100644 llama_cpp/server/app.py diff --git a/llama_cpp/server/__init__.py b/llama_cpp/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index af6cc38..dd4767f 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -5,283 +5,28 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette export MODEL=../models/7B/... -uvicorn fastapi_server_chat:app --reload +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. """ import os -import json -from threading import Lock -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict - -import llama_cpp - -from fastapi import Depends, FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 512 - n_threads: int = max((os.cpu_count() or 2) // 2, 1) - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
- use_mmap: bool = True - embedding: bool = True - last_n_tokens_size: int = 64 - logits_all: bool = False - cache: bool = False # WARNING: This is an experimental feature - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - use_mmap=settings.use_mmap, - embedding=settings.embedding, - logits_all=settings.logits_all, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) -if settings.cache: - cache = llama_cpp.LlamaCache() - llama.set_cache(cache) -llama_lock = Lock() - - -def get_llama(): - with llama_lock: - yield llama - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: Optional[List[str]] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion( - request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) -): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - - completion_or_chunks = llama( - **request.dict( - exclude={ - "model", - "n", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - completion: llama_cpp.Completion = completion_or_chunks # type: ignore - return completion - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) -): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: Optional[List[str]] = [] - max_tokens: int = 128 - - # 
ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -def create_chat_completion( - request: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import app if __name__ == "__main__": - import os - import uvicorn uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py new file mode 100644 index 0000000..d296e14 --- /dev/null +++ b/llama_cpp/server/app.py @@ -0,0 +1,266 @@ +import os +import json +from threading import Lock +from typing import List, Optional, Literal, Union, Iterator, Dict +from typing_extensions import TypedDict + +import llama_cpp + +from fastapi import Depends, FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str = os.environ["MODEL"] + n_ctx: int = 2048 + n_batch: int = 512 + n_threads: int = max((os.cpu_count() or 2) // 2, 1) + f16_kv: bool = True + use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
+ use_mmap: bool = True + embedding: bool = True + last_n_tokens_size: int = 64 + logits_all: bool = False + cache: bool = False # WARNING: This is an experimental feature + + +app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", +) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +settings = Settings() +llama = llama_cpp.Llama( + settings.model, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, + embedding=settings.embedding, + logits_all=settings.logits_all, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, +) +if settings.cache: + cache = llama_cpp.LlamaCache() + llama.set_cache(cache) +llama_lock = Lock() + + +def get_llama(): + with llama_lock: + yield llama + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] + suffix: Optional[str] = Field(None) + max_tokens: int = 16 + temperature: float = 0.8 + top_p: float = 0.95 + echo: bool = False + stop: Optional[List[str]] = [] + stream: bool = False + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = 40 + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + +CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) + + +@app.post( + "/v1/completions", + response_model=CreateCompletionResponse, +) +def create_completion( + request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) +): + if isinstance(request.prompt, list): + request.prompt = "".join(request.prompt) + + completion_or_chunks = llama( + **request.dict( + exclude={ + "model", + "n", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", + } + ) + ) + if request.stream: + chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore + return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + completion: llama_cpp.Completion = completion_or_chunks # type: ignore + return completion + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] + input: str + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + +CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) + + +@app.post( + "/v1/embeddings", + response_model=CreateEmbeddingResponse, +) +def create_embedding( + request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) +): + return llama.create_embedding(**request.dict(exclude={"model", "user"})) + + +class ChatCompletionRequestMessage(BaseModel): + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] + content: str + user: Optional[str] = None + + +class CreateChatCompletionRequest(BaseModel): + model: Optional[str] + messages: List[ChatCompletionRequestMessage] + temperature: float = 0.8 + top_p: float = 0.95 + stream: bool = False + stop: Optional[List[str]] = [] + max_tokens: int = 128 + + # 
ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ), + ] + } + } + + +CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) + + +@app.post( + "/v1/chat/completions", + response_model=CreateChatCompletionResponse, +) +def create_chat_completion( + request: CreateChatCompletionRequest, + llama: llama_cpp.Llama = Depends(get_llama), +) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + completion_or_chunks = llama.create_chat_completion( + **request.dict( + exclude={ + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + } + ), + ) + + if request.stream: + + async def server_sent_events( + chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], + ): + for chat_chunk in chat_chunks: + yield dict(data=json.dumps(chat_chunk)) + yield dict(data="[DONE]") + + chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore + + return EventSourceResponse( + server_sent_events(chunks), + ) + completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore + return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + +GetModelResponse = create_model_from_typeddict(ModelList) + + +@app.get("/v1/models", response_model=GetModelResponse) +def get_models() -> ModelList: + return { + "object": "list", + "data": [ + { + "id": llama.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } From 6d8db9d017b6b6b68bcff79cce5e770705ef016a Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Fri, 28 Apr 2023 23:26:07 -0700 Subject: [PATCH 088/443] tests: simple test for server module --- llama_cpp/server/app.py | 2 + poetry.lock | 95 ++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + tests/test_llama.py | 21 +++++++++ 4 files changed, 117 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index d296e14..2c50fcb 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -24,6 +24,7 @@ class Settings(BaseSettings): last_n_tokens_size: int = 64 logits_all: bool = False cache: bool = False # WARNING: This is an experimental feature + vocab_only: bool = False app = FastAPI( @@ -49,6 +50,7 @@ llama = llama_cpp.Llama( n_batch=settings.n_batch, n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, + vocab_only=settings.vocab_only, ) if settings.cache: cache = llama_cpp.LlamaCache() diff --git a/poetry.lock b/poetry.lock index 8a74d2f..a505168 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,25 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. 
+ +[[package]] +name = "anyio" +version = "3.6.2" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "anyio-3.6.2-py3-none-any.whl", hash = "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"}, + {file = "anyio-3.6.2.tar.gz", hash = "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421"}, +] + +[package.dependencies] +idna = ">=2.8" +sniffio = ">=1.1" + +[package.extras] +doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] +trio = ["trio (>=0.16,<0.22)"] [[package]] name = "attrs" @@ -398,6 +419,64 @@ colorama = ">=0.4" [package.extras] async = ["aiofiles (>=0.7,<1.0)"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "0.17.0" +description = "A minimal low-level HTTP client." +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpcore-0.17.0-py3-none-any.whl", hash = "sha256:0fdfea45e94f0c9fd96eab9286077f9ff788dd186635ae61b312693e4d943599"}, + {file = "httpcore-0.17.0.tar.gz", hash = "sha256:cc045a3241afbf60ce056202301b4d8b6af08845e3294055eb26b09913ef903c"}, +] + +[package.dependencies] +anyio = ">=3.0,<5.0" +certifi = "*" +h11 = ">=0.13,<0.15" +sniffio = ">=1.0.0,<2.0.0" + +[package.extras] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] + +[[package]] +name = "httpx" +version = "0.24.0" +description = "The next generation HTTP client." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"}, + {file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"}, +] + +[package.dependencies] +certifi = "*" +httpcore = ">=0.15.0,<0.18.0" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (>=1.0.0,<2.0.0)"] + [[package]] name = "idna" version = "3.4" @@ -1232,6 +1311,18 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -1367,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "cc9babcdfdc3679a4d84f68912408a005619a576947b059146ed1b428850ece9" +content-hash = "aa15e57300668bd23c051b4cd87bec4c1a58dcccd2f2b4767579fea7f2c5fa41" diff --git a/pyproject.toml b/pyproject.toml index 798fcaf..362899b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.4" pytest = "^7.2.2" +httpx = "^0.24.0" [build-system] requires = [ diff --git a/tests/test_llama.py b/tests/test_llama.py index 4727d90..9110286 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -128,3 +128,24 @@ def test_utf8(monkeypatch): n = 0 # reset completion = llama.create_completion("", max_tokens=1) assert completion["choices"][0]["text"] == "" + + +def test_llama_server(): + from fastapi.testclient import TestClient + import os + os.environ["MODEL"] = MODEL + os.environ["VOCAB_ONLY"] = "true" + from llama_cpp.server.app import app + client = TestClient(app) + response = client.get("/v1/models") + assert response.json() == { + "object": "list", + "data": [ + { + "id": MODEL, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } From efe8e6f8795eb2f92db22b841a40ad41fb053fe1 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Fri, 28 Apr 2023 23:47:36 -0700 Subject: [PATCH 089/443] llama_cpp server: slight refactor to init_llama function Define an init_llama function that starts llama with supplied settings instead of just doing it in the global context of app.py This allows the test to be less brittle by not needing to mess with os.environ, then importing the app --- llama_cpp/server/__main__.py | 3 ++- llama_cpp/server/app.py | 45 +++++++++++++++++++----------------- tests/test_llama.py | 9 ++++---- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index dd4767f..f57d68c 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -24,9 +24,10 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
import os import uvicorn -from llama_cpp.server.app import app +from llama_cpp.server.app import app, init_llama if __name__ == "__main__": + init_llama() uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2c50fcb..92b023c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -13,7 +13,7 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): - model: str = os.environ["MODEL"] + model: str = os.environ.get("MODEL", "null") n_ctx: int = 2048 n_batch: int = 512 n_threads: int = max((os.cpu_count() or 2) // 2, 1) @@ -38,31 +38,34 @@ app.add_middleware( allow_methods=["*"], allow_headers=["*"], ) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - use_mmap=settings.use_mmap, - embedding=settings.embedding, - logits_all=settings.logits_all, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, - vocab_only=settings.vocab_only, -) -if settings.cache: - cache = llama_cpp.LlamaCache() - llama.set_cache(cache) + +llama: llama_cpp.Llama = None +def init_llama(settings: Settings = None): + if settings is None: + settings = Settings() + global llama + llama = llama_cpp.Llama( + settings.model, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, + embedding=settings.embedding, + logits_all=settings.logits_all, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, + vocab_only=settings.vocab_only, + ) + if settings.cache: + cache = llama_cpp.LlamaCache() + llama.set_cache(cache) + llama_lock = Lock() - - def get_llama(): with llama_lock: yield llama - class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) diff --git a/tests/test_llama.py b/tests/test_llama.py index 9110286..c3f69cc 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -132,10 +132,11 @@ def test_utf8(monkeypatch): def test_llama_server(): from fastapi.testclient import TestClient - import os - os.environ["MODEL"] = MODEL - os.environ["VOCAB_ONLY"] = "true" - from llama_cpp.server.app import app + from llama_cpp.server.app import app, init_llama, Settings + s = Settings() + s.model = MODEL + s.vocab_only = True + init_llama(s) client = TestClient(app) response = client.get("/v1/models") assert response.json() == { From 80184a286c55380468beea22deef95b8114ae558 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 10:44:28 -0400 Subject: [PATCH 090/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 230 ++++++++++++++++++++++++++++++++++++++--- vendor/llama.cpp | 2 +- 2 files changed, 216 insertions(+), 16 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c081935..7b79feb 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -67,6 +67,12 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types +LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_MAGIC = b"ggjt" +LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" +LLAMA_SESSION_MAGIC = b"ggsn" +LLAMA_SESSION_VERSION = ctypes.c_int(0) + llama_context_p = c_void_p @@ -77,13 +83,24 @@ llama_token_p = POINTER(llama_token) class llama_token_data(Structure): _fields_ = [ ("id", llama_token), # token id + ("logit", c_float), # log-odds of the token ("p", 
c_float), # probability of the token - ("plog", c_float), # log probability of the token ] llama_token_data_p = POINTER(llama_token_data) + +class llama_token_data_array(Structure): + _fields_ = [ + ("data", llama_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +llama_token_data_array_p = POINTER(llama_token_data_array) + llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) @@ -118,7 +135,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors +# LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTYL_Q5_1 = ctypes.c_int(9) # except 1d tensors @@ -401,31 +418,214 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# TODO: improve the last_n_tokens interface ? -def llama_sample_top_p_top_k( +def llama_token_nl() -> llama_token: + return _lib.llama_token_nl() + + +_lib.llama_token_nl.argtypes = [] +_lib.llama_token_nl.restype = llama_token + + +# Sampling functions +def llama_sample_repetition_penalty( ctx: llama_context_p, - last_n_tokens_data, # type: Array[llama_token] - last_n_tokens_size: c_int, - top_k: c_int, - top_p: c_float, - temp: c_float, - repeat_penalty: c_float, + candidates, + last_tokens_data, + last_tokens_size: c_int, + penalty: c_float, ) -> llama_token: - return _lib.llama_sample_top_p_top_k( - ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + return _lib.llama_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty ) -_lib.llama_sample_top_p_top_k.argtypes = [ +_lib.llama_sample_repetition_penalty.argtypes = [ llama_context_p, + llama_token_data_array_p, llama_token_p, c_int, - c_int, c_float, +] +_lib.llama_sample_repetition_penalty.restype = llama_token + + +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def llama_sample_frequency_and_presence_penalties( + ctx: llama_context_p, + candidates, + last_tokens_data, + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +) -> llama_token: + return _lib.llama_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, c_float, c_float, ] -_lib.llama_sample_top_p_top_k.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token + + +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_softmax(ctx, candidates) + + +_lib.llama_sample_softmax.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_softmax.restype = llama_token + + +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_int 
+) -> llama_token: + return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.llama_sample_top_k.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_int, + c_int, +] + + +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.llama_sample_top_p.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_top_p.restype = llama_token + + +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +def llama_sample_tail_free( + ctx: llama_context_p, candidates, z: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.llama_sample_tail_free.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_tail_free.restype = llama_token + + +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_typical(ctx, candidates, p, min_keep) + + +_lib.llama_sample_typical.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_typical.restype = llama_token + + +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); +def llama_sample_temperature( + ctx: llama_context_p, candidates, temp: c_float +) -> llama_token: + return _lib.llama_sample_temperature(ctx, candidates, temp) + + +_lib.llama_sample_temperature.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, +] +_lib.llama_sample_temperature.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +def llama_sample_token_mirostat( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.llama_sample_token_mirostat.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_int, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +def llama_sample_token_mirostat_v2( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.llama_sample_token_mirostat_v2.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat_v2.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token_greedy(ctx, candidates) + + +_lib.llama_sample_token_greedy.argtypes = [ + llama_context_p, + 
llama_token_data_array_p, +] +_lib.llama_sample_token_greedy.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token(ctx, candidates) + + +_lib.llama_sample_token.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token.restype = llama_token # Performance information diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f15c5c..ea3a0ad 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f15c5c477d9933689a9d1c40794483e350c2f19 +Subproject commit ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae From 55d6308537bfa5252dc09643e4fc16f139a0d150 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 11:39:18 -0400 Subject: [PATCH 091/443] Fix test dependencies --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4481085..e60b36a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi httpx uvicorn pip install . -v - name: Test with pytest run: | From 7837c3fdc7674c5954c0ce75186708fb7cfaaecb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:02:06 -0400 Subject: [PATCH 092/443] Fix return types and import comments --- llama_cpp/llama_cpp.py | 72 ++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7b79feb..95e9cfa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -427,13 +427,16 @@ _lib.llama_token_nl.restype = llama_token # Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, last_tokens_data, last_tokens_size: c_int, penalty: c_float, -) -> llama_token: +): return _lib.llama_sample_repetition_penalty( ctx, candidates, last_tokens_data, last_tokens_size, penalty ) @@ -446,10 +449,10 @@ _lib.llama_sample_repetition_penalty.argtypes = [ c_int, c_float, ] -_lib.llama_sample_repetition_penalty.restype = llama_token +_lib.llama_sample_repetition_penalty.restype = None -# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, @@ -457,7 +460,7 @@ def llama_sample_frequency_and_presence_penalties( last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, -) -> llama_token: +): return _lib.llama_sample_frequency_and_presence_penalties( ctx, candidates, @@ -476,11 +479,11 @@ _lib.llama_sample_frequency_and_presence_penalties.argtypes = [ c_float, c_float, ] -_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = None -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +def llama_sample_softmax(ctx: llama_context_p, candidates): return _lib.llama_sample_softmax(ctx, candidates) @@ -488,13 +491,11 @@ _lib.llama_sample_softmax.argtypes = [ llama_context_p, llama_token_data_array_p, ] -_lib.llama_sample_softmax.restype = llama_token +_lib.llama_sample_softmax.restype = None -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); -def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_int -) -> llama_token: +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -504,12 +505,11 @@ _lib.llama_sample_top_k.argtypes = [ c_int, c_int, ] +_lib.llama_sample_top_k.restype = None -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -519,13 +519,13 @@ _lib.llama_sample_top_p.argtypes = [ c_float, c_int, ] -_lib.llama_sample_top_p.restype = llama_token +_lib.llama_sample_top_p.restype = None -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, candidates, z: c_float, min_keep: c_int -) -> llama_token: +): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -535,13 +535,11 @@ _lib.llama_sample_tail_free.argtypes = [ c_float, c_int, ] -_lib.llama_sample_tail_free.restype = llama_token +_lib.llama_sample_tail_free.restype = None -# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
+def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -551,13 +549,10 @@ _lib.llama_sample_typical.argtypes = [ c_float, c_int, ] -_lib.llama_sample_typical.restype = llama_token +_lib.llama_sample_typical.restype = None -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, candidates, temp: c_float -) -> llama_token: +def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -566,10 +561,15 @@ _lib.llama_sample_temperature.argtypes = [ llama_token_data_array_p, c_float, ] -_lib.llama_sample_temperature.restype = llama_token +_lib.llama_sample_temperature.restype = None -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu ) -> llama_token: @@ -587,7 +587,11 @@ _lib.llama_sample_token_mirostat.argtypes = [ _lib.llama_sample_token_mirostat.restype = llama_token -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu ) -> llama_token: @@ -604,7 +608,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ _lib.llama_sample_token_mirostat_v2.restype = llama_token -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Selects the token with the highest probability. def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -616,7 +620,7 @@ _lib.llama_sample_token_greedy.argtypes = [ _lib.llama_sample_token_greedy.restype = llama_token -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 350a1769e188629c484bb127415cd82751a86597 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:47:55 -0400 Subject: [PATCH 093/443] Update sampling api --- llama_cpp/llama.py | 119 ++++++++++++++++++++++++++++++++++------- llama_cpp/llama_cpp.py | 22 +++++--- 2 files changed, 113 insertions(+), 28 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4e3c3aa..b38f2bb 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,9 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx) + self.eval_logits: Deque[List[llama_cpp.c_float]] = deque( + maxlen=n_ctx if logits_all else 1 + ) self.cache: Optional[LlamaCache] = None @@ -236,17 +238,90 @@ class Llama: ) if int(return_code) != 0: raise RuntimeError(f"llama_eval returned {return_code}") + # Save tokens self.eval_tokens.extend(batch) - if self.params.logits_all: - n_vocab = llama_cpp.llama_n_vocab(self.ctx) - cols = int(n_vocab) - rows = n_tokens - logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [ - [logits_view[i * cols + j] for j in range(cols)] - for i in range(rows) - ] - self.eval_logits.extend(logits) + # Save logits + rows = n_tokens if self.params.logits_all else 1 + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + cols = int(n_vocab) + logits_view = llama_cpp.llama_get_logits(self.ctx) + logits: List[List[llama_cpp.c_float]] = [ + [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) + ] + self.eval_logits.extend(logits) + + def _sample_top_p_top_k( + self, + last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] + last_n_tokens_size: llama_cpp.c_int, + top_k: llama_cpp.c_int, + top_p: llama_cpp.c_float, + temp: llama_cpp.c_float, + repeat_penalty: llama_cpp.c_float, + ): + assert self.ctx is not None + assert len(self.eval_logits) > 0 + n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + logits = self.eval_logits[-1] + data = (llama_cpp.llama_token_data * n_vocab)( + *[ + llama_cpp.llama_token_data( + id=llama_cpp.llama_token(i), + logit=logits[i], + p=llama_cpp.c_float(0.0), 
+ ) + for i in range(n_vocab) + ] + ) + size = llama_cpp.c_size_t(n_vocab) + sorted = False + candidates = llama_cpp.llama_token_data_array( + data=data, + size=size, + sorted=sorted, + ) + llama_cpp.llama_sample_repetition_penalty( + ctx=self.ctx, + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + candidates=llama_cpp.ctypes.pointer(candidates), + penalty=repeat_penalty, + ) + if temp == 0.0: + return llama_cpp.llama_sample_token_greedy( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + ) + else: + llama_cpp.llama_sample_top_k( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + k=top_k, + ) + llama_cpp.llama_sample_tail_free( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + z=llama_cpp.c_float(1.0), + ) + llama_cpp.llama_sample_typical( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + p=llama_cpp.c_float(1.0) + ) + llama_cpp.llama_sample_top_p( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + p=top_p, + ) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + temp=temp, + ) + return llama_cpp.llama_sample_token( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + ) def sample( self, @@ -270,8 +345,7 @@ class Llama: last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self.eval_tokens) ) + list(self.eval_tokens)[-self.last_n_tokens_size :] - return llama_cpp.llama_sample_top_p_top_k( - ctx=self.ctx, + return self._sample_top_p_top_k( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data ), @@ -470,15 +544,15 @@ class Llama: all_text = self.detokenize(completion_tokens) # Contains multi-byte UTF8 - for k,char in enumerate(all_text[-3:]): + for k, char in enumerate(all_text[-3:]): k = 3 - k - for num,pattern in [(2, 192), (3, 224), (4, 240)]: + for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if (num > k and pattern & char == pattern): + if num > k and pattern & char == pattern: multibyte_fix = num - k # Stop incomplete bytes from passing - if (multibyte_fix > 0): + if multibyte_fix > 0: multibyte_fix -= 1 continue @@ -531,7 +605,9 @@ class Llama: "model": self.model_path, "choices": [ { - "text": text[returned_characters:].decode("utf-8", errors="ignore"), + "text": text[returned_characters:].decode( + "utf-8", errors="ignore" + ), "index": 0, "logprobs": None, "finish_reason": finish_reason, @@ -558,7 +634,8 @@ class Llama: all_tokens = prompt_tokens + completion_tokens all_token_strs = [ - self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens + self.detokenize([token]).decode("utf-8", errors="ignore") + for token in all_tokens ] all_logprobs = [ [Llama.logit_to_logprob(logit) for logit in row] @@ -577,7 +654,9 @@ class Llama: ) token_logprobs.append(sorted_logprobs[int(token)][0]) top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8", errors="ignore"): logprob + self.detokenize([llama_cpp.llama_token(i)]).decode( + "utf-8", errors="ignore" + ): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 95e9cfa..b4717bf 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -495,7 +495,9 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text 
Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -503,13 +505,15 @@ _lib.llama_sample_top_k.argtypes = [ llama_context_p, llama_token_data_array_p, c_int, - c_int, + c_size_t, ] _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -517,14 +521,14 @@ _lib.llama_sample_top_p.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_int + ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -533,13 +537,15 @@ _lib.llama_sample_tail_free.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. -def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -547,7 +553,7 @@ _lib.llama_sample_typical.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_typical.restype = None From dbe0ad86c852d72148e930366240876dfb335d72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:50:01 -0400 Subject: [PATCH 094/443] Update test dependencies --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e60b36a..671af84 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pip install . -v - name: Test with pytest run: | From 2f8a3adaa40a30531bb86c91649b0d2f6200fc61 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:01:49 -0400 Subject: [PATCH 095/443] Temporarily skip sampling tests. 
--- tests/test_llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_llama.py b/tests/test_llama.py index c3f69cc..fe2bd66 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,3 +1,4 @@ +import pytest import llama_cpp MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" @@ -14,6 +15,7 @@ def test_llama(): assert llama.detokenize(llama.tokenize(text)) == text +@pytest.mark.skip(reason="need to update sample mocking") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) @@ -95,6 +97,7 @@ def test_llama_pickle(): assert llama.detokenize(llama.tokenize(text)) == text +@pytest.mark.skip(reason="need to update sample mocking") def test_utf8(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) From 9ff9cdd7fc6cc94fefae15cae97f2b38ee358d14 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:11:15 -0400 Subject: [PATCH 096/443] Fix import error --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 92b023c..640dd3f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,8 +1,8 @@ import os import json from threading import Lock -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict +from typing import List, Optional, Union, Iterator, Dict +from typing_extensions import TypedDict, Literal import llama_cpp From f073ef057137a96c8f6a55c439bf46fb7436c4d2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:23:01 -0400 Subject: [PATCH 097/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ea3a0ad..58b367c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae +Subproject commit 58b367c2d757c0ea12aec672382462b42204c724 From 5034bbf499d56ccfedefe932f4bf7e8c29e2e0c6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:23:59 -0400 Subject: [PATCH 098/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 362899b..54088b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.39" +version = "0.1.40" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 19693a1..fbd22c6 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.39", + version="0.1.40", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From bf3d0dcb2cf855c8e68e4e09b2a3a4b6d48e85b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:28:46 -0400 Subject: [PATCH 099/443] Fix tests --- .github/workflows/test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 671af84..56524e0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install 
--upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pip install . -v - name: Test with pytest run: | From c088a2b3a71d21b9c505300d9bbd37f07881ed25 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 15:46:03 -0400 Subject: [PATCH 100/443] Un-skip tests --- tests/test_llama.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_llama.py b/tests/test_llama.py index fe2bd66..2bf38b3 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,4 +1,3 @@ -import pytest import llama_cpp MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" @@ -15,15 +14,20 @@ def test_llama(): assert llama.detokenize(llama.tokenize(text)) == text -@pytest.mark.skip(reason="need to update sample mocking") +# @pytest.mark.skip(reason="need to update sample mocking") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) ## Set up mock function def mock_eval(*args, **kwargs): return 0 + + def mock_get_logits(*args, **kwargs): + return (llama_cpp.c_float * n_vocab)(*[llama_cpp.c_float(0) for _ in range(n_vocab)]) monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) output_text = " jumps over the lazy dog." output_tokens = llama.tokenize(output_text.encode("utf-8")) @@ -38,7 +42,7 @@ def test_llama_patch(monkeypatch): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) text = "The quick brown fox" @@ -97,15 +101,19 @@ def test_llama_pickle(): assert llama.detokenize(llama.tokenize(text)) == text -@pytest.mark.skip(reason="need to update sample mocking") def test_utf8(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) ## Set up mock function def mock_eval(*args, **kwargs): return 0 + def mock_get_logits(*args, **kwargs): + return (llama_cpp.c_float * n_vocab)(*[llama_cpp.c_float(0) for _ in range(n_vocab)]) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) output_text = "😀" output_tokens = llama.tokenize(output_text.encode("utf-8")) @@ -120,7 +128,7 @@ def test_utf8(monkeypatch): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) ## Test basic completion with utf8 multibyte n = 0 # reset From b6747f722e473cb8380a3b8145704e18d8fc76b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 17:45:08 -0400 Subject: [PATCH 101/443] Fix logprob calculation. 
Fixes #134 --- llama_cpp/llama.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b38f2bb..bec5be7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -638,7 +638,7 @@ class Llama: for token in all_tokens ] all_logprobs = [ - [Llama.logit_to_logprob(logit) for logit in row] + Llama._logits_to_logprobs(row) for row in self.eval_logits ] for token, token_str, logprobs_token in zip( @@ -980,5 +980,7 @@ class Llama: return llama_cpp.llama_token_bos() @staticmethod - def logit_to_logprob(x: float) -> float: - return math.log(1.0 + math.exp(x)) + def logits_to_logprobs(logits: List[llama_cpp.c_float]) -> List[llama_cpp.c_float]: + exps = [math.exp(float(x)) for x in logits] + sum_exps = sum(exps) + return [llama_cpp.c_float(math.log(x / sum_exps)) for x in exps] From 53c0129eb6aac3fad21b08a0f9bb05156834fcee Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 18:07:15 -0400 Subject: [PATCH 102/443] Update submoduele clone instructions --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 906d055..80518f6 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,7 @@ This package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone git@github.com:abetlen/llama-cpp-python.git -git submodule update --init --recursive +git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git # Will need to be re-run any time vendor/llama.cpp is updated python3 setup.py develop ``` From 9d60ae56f28c169337bc4aaf4090e28e78f411de Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 18:07:45 -0400 Subject: [PATCH 103/443] Fix whitespace --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 80518f6..9ee9199 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ This package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git +git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git # Will need to be re-run any time vendor/llama.cpp is updated python3 setup.py develop ``` From e40fcb05754d0ec9c65359e245a436794cbfefdb Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 00:47:35 -0700 Subject: [PATCH 104/443] llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. --- llama_cpp/server/app.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 640dd3f..5d87e78 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -66,6 +66,10 @@ def get_llama(): with llama_lock: yield llama +model_field = Field( + description="The model to use for generating completions." 
+) + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) @@ -76,8 +80,9 @@ class CreateCompletionRequest(BaseModel): stop: Optional[List[str]] = [] stream: bool = False - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 logprobs: Optional[int] = Field(None) presence_penalty: Optional[float] = 0 @@ -133,7 +138,8 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - model: Optional[str] + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field input: str user: Optional[str] @@ -173,8 +179,9 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = [] max_tokens: int = 128 - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 presence_penalty: Optional[float] = 0 frequency_penalty: Optional[float] = 0 From b47b9549d57f146a00ee19cd7d2bb294111abb67 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 01:19:30 -0700 Subject: [PATCH 105/443] llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. decision: delete it --- llama_cpp/llama_types.py | 2 -- llama_cpp/server/app.py | 30 +++--------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b62ff1b..b8bdb08 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,8 +60,6 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Union[Literal["assistant"], Literal["user"], Literal["system"]] content: str - user: NotRequired[str] - class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d87e78..cc467db 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -83,13 +83,7 @@ class CreateCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - n: Optional[int] = 1 logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = 40 @@ -120,13 +114,7 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model", - "n", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", + "model" } ) ) @@ -141,7 +129,6 @@ class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field input: str - user: Optional[str] class Config: schema_extra = { @@ -161,7 +148,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"model"})) class ChatCompletionRequestMessage(BaseModel): @@ -181,12 
+168,6 @@ class CreateChatCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters repeat_penalty: float = 1.1 @@ -220,12 +201,7 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", + "model" } ), ) From 1e429135993f4e1298d8c801f2628bae3d8f18a9 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:48:49 -0700 Subject: [PATCH 106/443] llama_cpp server: move logprobs to supported I think this is actually supported (its in the arguments of `LLama.__call__`, which is how the completion is invoked). decision: mark as supported --- llama_cpp/server/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index cc467db..2d20f37 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -79,12 +79,11 @@ class CreateCompletionRequest(BaseModel): echo: bool = False stop: Optional[List[str]] = [] stream: bool = False + logprobs: Optional[int] = Field(None) # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - logprobs: Optional[int] = Field(None) - # llama.cpp specific parameters top_k: int = 40 repeat_penalty: float = 1.1 From a5aa6c1478de7cc16b654df533be3dee6519c42a Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:52:20 -0700 Subject: [PATCH 107/443] llama_cpp server: add missing top_k param to CreateChatCompletionRequest `llama.create_chat_completion` definitely has a `top_k` argument, but its missing from `CreateChatCompletionRequest`. decision: add it --- llama_cpp/server/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2d20f37..e1045af 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -169,6 +169,7 @@ class CreateChatCompletionRequest(BaseModel): model: str = model_field # llama.cpp specific parameters + top_k: int = 40, repeat_penalty: float = 1.1 class Config: From 978b6daf9313a11367d0a9393226379173fdb688 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 14:37:36 -0700 Subject: [PATCH 108/443] llama_cpp server: add some more information to fields for completions --- llama_cpp/server/app.py | 70 ++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e1045af..e168485 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -71,22 +71,70 @@ model_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: Optional[List[str]] = [] - stream: bool = False - logprobs: Optional[int] = Field(None) + prompt: Union[str, List[str]] = Field( + default="", + description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." 
+ ) + max_tokens: int = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." + ) + temperature: float = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." + ) + top_p: float = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." + ) + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots." + ) + stop: Optional[List[str]] = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." + ) + stream: bool = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." + ) + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated." + ) + + # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 + top_k: int = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + ) + repeat_penalty: float = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." 
+ ) class Config: schema_extra = { From 8dcbf65a45d729eedb4363f4e92247e6325d5b7d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:37:43 -0700 Subject: [PATCH 109/443] llama_cpp server: define fields for chat completions Slight refactor for common fields shared between completion and chat completion --- llama_cpp/server/app.py | 125 +++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 54 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e168485..ec5dbd3 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -70,6 +70,55 @@ model_field = Field( description="The model to use for generating completions." ) +max_tokens_field = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." +) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." +) + +repeat_penalty_field = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. 
A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." +) + + + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( default="", @@ -79,62 +128,27 @@ class CreateCompletionRequest(BaseModel): default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." ) - max_tokens: int = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." - ) - temperature: float = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." - ) - top_p: float = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." - ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots." ) - stop: Optional[List[str]] = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." - ) - stream: bool = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." - ) + stop: Optional[List[str]] = stop_field + stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated." ) - - # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." - ) - repeat_penalty: float = Field( - default=1.0, - ge=0.0, - description="A penalty applied to each token that is already generated. 
This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." - ) + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { @@ -199,26 +213,29 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( + default=Literal["user"], description="The role of the message." + ) + content: str = Field(default="", description="The content of the message.") class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: Optional[List[str]] = [] - max_tokens: int = 128 + messages: List[ChatCompletionRequestMessage] = Field( + default=[], + description="A list of messages to generate completions for." + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + stop: Optional[List[str]] = stop_field + stream: bool = stream_field # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40, - repeat_penalty: float = 1.1 + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { From fa2a61e06569bb600d36d7ea5fee2ab456b3434d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:46:01 -0700 Subject: [PATCH 110/443] llama_cpp server: fields for the embedding endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ec5dbd3..9adddcd 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -189,7 +189,9 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - input: str + input: str = Field( + description="The input to embed." + ) class Config: schema_extra = { From dbbfc4ba2f8460e130dc268096f5906d3d22347b Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Mon, 1 May 2023 11:48:37 -0700 Subject: [PATCH 111/443] llama_cpp server: fix to ChatCompletionRequestMessage When I generate a client, it breaks because it fails to process the schema of ChatCompletionRequestMessage These fix that: - I think `Union[Literal["user"], Literal["channel"], ...]` is the same as Literal["user", "channel", ...] 
- Turns out default value `Literal["user"]` isn't JSON serializable, so replace with "user" --- llama_cpp/llama_types.py | 2 +- llama_cpp/server/app.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b8bdb08..b770a01 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -58,7 +58,7 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): - role: Union[Literal["assistant"], Literal["user"], Literal["system"]] + role: Literal["assistant", "user", "system"] content: str class ChatCompletionChoice(TypedDict): diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 9adddcd..886ee6d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -215,8 +215,8 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( - default=Literal["user"], description="The role of the message." + role: Literal["system", "user", "assistant"] = Field( + default="user", description="The role of the message." ) content: str = Field(default="", description="The content of the message.") From dd9ad1c759d2d407502a4c03758c76659d83ab59 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 21:51:16 -0400 Subject: [PATCH 112/443] Formatting --- llama_cpp/llama.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bec5be7..d201013 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -306,7 +306,7 @@ class Llama: llama_cpp.llama_sample_typical( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), - p=llama_cpp.c_float(1.0) + p=llama_cpp.c_float(1.0), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, @@ -637,10 +637,7 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [ - Llama._logits_to_logprobs(row) - for row in self.eval_logits - ] + all_logprobs = [Llama._logits_to_logprobs(row) for row in self.eval_logits] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): From 9eafc4c49aa4d1dbd3cf58c73c753382a821800f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 22:38:46 -0400 Subject: [PATCH 113/443] Refactor server to use factory --- llama_cpp/server/__main__.py | 4 +-- llama_cpp/server/app.py | 51 +++++++++++++++++++++--------------- tests/test_llama.py | 23 ++++++++++------ 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index f57d68c..4fbee37 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -24,10 +24,10 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
import os import uvicorn -from llama_cpp.server.app import app, init_llama +from llama_cpp.server.app import create_app if __name__ == "__main__": - init_llama() + app = create_app() uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 640dd3f..8e86088 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -2,18 +2,18 @@ import os import json from threading import Lock from typing import List, Optional, Union, Iterator, Dict -from typing_extensions import TypedDict, Literal +from typing_extensions import TypedDict, Literal, Annotated import llama_cpp -from fastapi import Depends, FastAPI +from fastapi import Depends, FastAPI, APIRouter from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): - model: str = os.environ.get("MODEL", "null") + model: str n_ctx: int = 2048 n_batch: int = 512 n_threads: int = max((os.cpu_count() or 2) // 2, 1) @@ -27,25 +27,29 @@ class Settings(BaseSettings): vocab_only: bool = False -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +router = APIRouter() -llama: llama_cpp.Llama = None -def init_llama(settings: Settings = None): +llama: Optional[llama_cpp.Llama] = None + + +def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() + app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + app.include_router(router) global llama llama = llama_cpp.Llama( - settings.model, + model_path=settings.model, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, @@ -60,12 +64,17 @@ def init_llama(settings: Settings = None): if settings.cache: cache = llama_cpp.LlamaCache() llama.set_cache(cache) + return app + llama_lock = Lock() + + def get_llama(): with llama_lock: yield llama + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) @@ -102,7 +111,7 @@ class CreateCompletionRequest(BaseModel): CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) -@app.post( +@router.post( "/v1/completions", response_model=CreateCompletionResponse, ) @@ -148,7 +157,7 @@ class CreateEmbeddingRequest(BaseModel): CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) -@app.post( +@router.post( "/v1/embeddings", response_model=CreateEmbeddingResponse, ) @@ -202,7 +211,7 @@ class CreateChatCompletionRequest(BaseModel): CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) -@app.post( +@router.post( "/v1/chat/completions", response_model=CreateChatCompletionResponse, ) @@ -256,7 +265,7 @@ class ModelList(TypedDict): GetModelResponse = create_model_from_typeddict(ModelList) -@app.get("/v1/models", response_model=GetModelResponse) +@router.get("/v1/models", response_model=GetModelResponse) def get_models() -> ModelList: return { "object": "list", diff --git a/tests/test_llama.py b/tests/test_llama.py index 2bf38b3..3ea19e0 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -22,9 +22,11 @@ def 
test_llama_patch(monkeypatch): ## Set up mock function def mock_eval(*args, **kwargs): return 0 - + def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)(*[llama_cpp.c_float(0) for _ in range(n_vocab)]) + return (llama_cpp.c_float * n_vocab)( + *[llama_cpp.c_float(0) for _ in range(n_vocab)] + ) monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) @@ -88,6 +90,7 @@ def test_llama_patch(monkeypatch): def test_llama_pickle(): import pickle import tempfile + fp = tempfile.TemporaryFile() llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) pickle.dump(llama, fp) @@ -101,6 +104,7 @@ def test_llama_pickle(): assert llama.detokenize(llama.tokenize(text)) == text + def test_utf8(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) @@ -110,7 +114,9 @@ def test_utf8(monkeypatch): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)(*[llama_cpp.c_float(0) for _ in range(n_vocab)]) + return (llama_cpp.c_float * n_vocab)( + *[llama_cpp.c_float(0) for _ in range(n_vocab)] + ) monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) @@ -143,11 +149,12 @@ def test_utf8(monkeypatch): def test_llama_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import app, init_llama, Settings - s = Settings() - s.model = MODEL - s.vocab_only = True - init_llama(s) + from llama_cpp.server.app import create_app, Settings + + settings = Settings() + settings.model = MODEL + settings.vocab_only = True + app = create_app(settings) client = TestClient(app) response = client.get("/v1/models") assert response.json() == { From 46e3c4b84a66efff60dd882173c54acc56e06356 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 22:41:54 -0400 Subject: [PATCH 114/443] Fix --- tests/test_llama.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_llama.py b/tests/test_llama.py index 3ea19e0..b3426b8 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -151,9 +151,10 @@ def test_llama_server(): from fastapi.testclient import TestClient from llama_cpp.server.app import create_app, Settings - settings = Settings() - settings.model = MODEL - settings.vocab_only = True + settings = Settings( + model=MODEL, + vocab_only=True, + ) app = create_app(settings) client = TestClient(app) response = client.get("/v1/models") From e9e0654aed80cd5bf8f9f23b16bb6dac76e2845a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 22:52:25 -0400 Subject: [PATCH 115/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 54088b6..64f7a0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.40" +version = "0.1.41" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index fbd22c6..f7f0fa4 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.40", + version="0.1.41", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 
f97ff3c5bbc2ac4ca5e96c793faf7f8a1d0bbc31 Mon Sep 17 00:00:00 2001 From: Matt Hoffner Date: Mon, 1 May 2023 20:40:06 -0700 Subject: [PATCH 116/443] Update llama_cpp.py --- llama_cpp/llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b4717bf..e0424c6 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -136,9 +136,9 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTYL_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTYL_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTYL_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors # Functions From d605408f9917943bc0c969b502335ab56b5b2d59 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 00:55:34 -0400 Subject: [PATCH 117/443] Add dockerignore --- .dockerignore | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..fd64c09 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,166 @@ +_skbuild/ + +.envrc + +models/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ From 81631afc48990135c20ece1d52872a7de3033715 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 00:55:51 -0400 Subject: [PATCH 118/443] Install from local directory --- Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ade4ac9..14fb3be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,13 @@ FROM python:3-bullseye # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 +COPY . . 
+ # Install the package -RUN apt update && apt install -y libopenblas-dev && LLAMA_OPENBLAS=1 pip install llama-cpp-python[server] +RUN apt update && apt install -y libopenblas-dev +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +RUN LLAMA_OPENBLAS=1 python3 setup.py develop # Run the server CMD python3 -m llama_cpp.server \ No newline at end of file From 5d5421b29ddd45ea693d0ce36552c7ff40d83187 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:04:02 -0400 Subject: [PATCH 119/443] Add build docker --- .github/workflows/build-docker.yaml | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/build-docker.yaml diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml new file mode 100644 index 0000000..9a06da5 --- /dev/null +++ b/.github/workflows/build-docker.yaml @@ -0,0 +1,33 @@ +name: Build Docker + +on: workflow_dispatch + +jobs: + docker: + name: Build and push Docker image + runs-on: ubuntu-latest + needs: build-n-publish + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file From 36c81489e7dfb3af9c78b2f07da62f39775cd23e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:04:36 -0400 Subject: [PATCH 120/443] Remove docker section of publish --- .github/workflows/publish.yaml | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 16a6012..ddefd68 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -29,32 +29,3 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} - - docker: - name: Build and push Docker image - runs-on: ubuntu-latest - needs: build-n-publish - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - uses: docker/build-push-action@v4 - with: - push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file From 62de4692f227a6dc468ce2054bef09caa805b44a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:09:27 -0400 Subject: [PATCH 121/443] Fix missing dependency --- .github/workflows/build-docker.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 9a06da5..f4290cd 100644 --- a/.github/workflows/build-docker.yaml +++ 
b/.github/workflows/build-docker.yaml @@ -6,7 +6,6 @@ jobs: docker: name: Build and push Docker image runs-on: ubuntu-latest - needs: build-n-publish steps: - name: Checkout uses: actions/checkout@v3 From 872b2ec33f97c69ac601df9f5c73b8ad110f6aa0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:11:34 -0400 Subject: [PATCH 122/443] Clone submodules --- .github/workflows/build-docker.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index f4290cd..2a0c52d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -9,6 +9,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + with: + submodules: "true" - name: Set up QEMU uses: docker/setup-qemu-action@v2 From c21a34506ebca1eda2a57ef5b6e13e2151049d37 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:13:43 -0400 Subject: [PATCH 123/443] Update permsissions --- .github/workflows/build-docker.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 2a0c52d..7265b32 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -2,6 +2,9 @@ name: Build Docker on: workflow_dispatch +permissions: + contents: write + jobs: docker: name: Build and push Docker image From 63f8d3a6fb97a1973f3a0529050b71f1d884511d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:16:44 -0400 Subject: [PATCH 124/443] Update context --- .github/workflows/build-docker.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 7265b32..dda2afe 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -31,6 +31,7 @@ jobs: - name: Build and push uses: docker/build-push-action@v4 with: + context: . push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From c2e31eeceee5380e3c6357b73105585265de68bf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 2 May 2023 01:23:17 -0400 Subject: [PATCH 125/443] Update permissions --- .github/workflows/build-docker.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index dda2afe..16b00a2 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -4,6 +4,7 @@ on: workflow_dispatch permissions: contents: write + packages: write jobs: docker: From 0fcc25cdacc550ca5ab663239a3600b297c4a188 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Fri, 28 Apr 2023 23:54:31 -0700 Subject: [PATCH 126/443] examples fastapi_server: deprecate This commit "deprecates" the example fastapi server by remaining runnable but pointing folks at the module if they want to learn more. Rationale: Currently there exist two server implementations in this repo: - `llama_cpp/server/__main__.py`, the module that's runnable by consumers of the library with `python3 -m llama_cpp.server` - `examples/high_level_api/fastapi_server.py`, which is probably a copy-pasted example by folks hacking around IMO this is confusing. As a new user of the library I see they've both been updated relatively recently but looking side-by-side there's a diff. 
The one in the module seems better: - supports logits_all - supports use_mmap - has experimental cache support (with some mutex thing going on) - some stuff with streaming support was moved around more recently than fastapi_server.py --- examples/high_level_api/fastapi_server.py | 267 ++-------------------- 1 file changed, 21 insertions(+), 246 deletions(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 3ed0eac..4b3189d 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -4,259 +4,34 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/ggml-model.bin -uvicorn fastapi_server_chat:app --reload +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. + +To actually see the implementation of the server, see llama_cpp/server/app.py + """ import os -import json -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict - -import llama_cpp - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 8 - n_threads: int = int(os.cpu_count() / 2) or 1 - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - embedding: bool = True - last_n_tokens_size: int = 64 - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - embedding=settings.embedding, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) - - -class CreateCompletionRequest(BaseModel): - prompt: str - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: List[str] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion(request: CreateCompletionRequest): - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama( - 
**request.dict( - exclude={ - "model", - "n", - "logprobs", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding(request: CreateEmbeddingRequest): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: List[str] = [] - max_tokens: int = 128 - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -async def create_chat_completion( - request: CreateChatCompletionRequest, -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import create_app if __name__ == "__main__": - import os - import uvicorn + app = create_app() - uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000)) + uvicorn.run( + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + ) From b9098b0ef7309b63ebff99cdfadf641223c15025 Mon Sep 17 00:00:00 2001 From: 
Lucas Doyle Date: Tue, 2 May 2023 14:08:51 -0700 Subject: [PATCH 127/443] llama_cpp server: prompt is a string Not sure why this union type was here but taking a look at llama.py, prompt is only ever processed as a string for completion This was breaking types when generating an openapi client --- llama_cpp/server/app.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef8aa4e..595476f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -126,7 +126,7 @@ repeat_penalty_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( + prompt: Optional[str] = Field( default="", description="The prompt to generate completions for." ) @@ -175,9 +175,6 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - completion_or_chunks = llama( **request.dict( exclude={ From 1d47cce22269f4422d0d302e9d3bab2583dfdb2a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 09:33:30 -0400 Subject: [PATCH 128/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 5 +++-- vendor/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e0424c6..30414f5 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -71,7 +71,7 @@ LLAMA_FILE_VERSION = ctypes.c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(0) +LLAMA_SESSION_VERSION = ctypes.c_int(1) llama_context_p = c_void_p @@ -239,7 +239,8 @@ _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None -# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 58b367c..e216aa0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 58b367c2d757c0ea12aec672382462b42204c724 +Subproject commit e216aa04633892b972d013719e38b59fd4917341 From 43f2907e3aecc81d26cc7c73b08b7a973a67aabb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 09:33:50 -0400 Subject: [PATCH 129/443] Support smaller state sizes --- llama_cpp/llama.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d201013..1b9f9e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -53,12 +53,14 @@ class LlamaState: def __init__( self, eval_tokens: Deque[llama_cpp.llama_token], - eval_logits: Deque[List[float]], + eval_logits: Deque[List[llama_cpp.c_float]], llama_state, + llama_state_size: llama_cpp.c_size_t, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits self.llama_state = llama_state + self.llama_state_size = llama_state_size class Llama: @@ -950,19 +952,23 @@ class Llama: assert self.ctx is not None state_size = llama_cpp.llama_get_state_size(self.ctx) llama_state = (llama_cpp.c_uint8 * int(state_size))() - if llama_cpp.llama_copy_state_data(self.ctx, llama_state) != state_size: + n_bytes = 
llama_cpp.llama_copy_state_data(self.ctx, llama_state) + if int(n_bytes) > int(state_size): raise RuntimeError("Failed to copy llama state data") + llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() + llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) return LlamaState( eval_tokens=self.eval_tokens.copy(), eval_logits=self.eval_logits.copy(), - llama_state=llama_state, + llama_state=llama_state_compact, + llama_state_size=n_bytes, ) def load_state(self, state: LlamaState) -> None: assert self.ctx is not None self.eval_tokens = state.eval_tokens.copy() self.eval_logits = state.eval_logits.copy() - state_size = llama_cpp.llama_get_state_size(self.ctx) + state_size = state.llama_state_size if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: raise RuntimeError("Failed to set llama state data") From 9e5b6d675a49a4466b1fab841baf570e5efeb549 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 10:28:10 -0400 Subject: [PATCH 130/443] Improve logging messages --- llama_cpp/llama.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1b9f9e9..fef7b3e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -396,7 +396,7 @@ class Llama: and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)]) ): if self.verbose: - print("generate cache hit", file=sys.stderr) + print("Llama.generate: cache hit", file=sys.stderr) reset = False tokens = tokens[len(self.eval_tokens) :] @@ -518,7 +518,7 @@ class Llama: if self.cache and prompt_tokens in self.cache: if self.verbose: - print("cache hit", file=sys.stderr) + print("Llama._create_completion: cache hit", file=sys.stderr) self.load_state(self.cache[prompt_tokens]) finish_reason = "length" @@ -538,7 +538,7 @@ class Llama: if self.cache and len(completion_tokens) == 0: if prompt_tokens not in self.cache: if self.verbose: - print("cache miss", file=sys.stderr) + print("Llama._create_completion: cache miss", file=sys.stderr) self.cache[prompt_tokens] = self.save_state() completion_tokens.append(token) @@ -957,6 +957,8 @@ class Llama: raise RuntimeError("Failed to copy llama state data") llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + if self.verbose: + print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr) return LlamaState( eval_tokens=self.eval_tokens.copy(), eval_logits=self.eval_logits.copy(), From 7839eb14d3c75a589a3665e447e995745eee7f30 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 10:29:05 -0400 Subject: [PATCH 131/443] Add docker cuda image. 
Closes #143 --- .github/workflows/build-docker.yaml | 34 ++++++++++++++++++++++++++++- Dockerfile.cuda | 15 +++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 Dockerfile.cuda diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 16b00a2..2b340d8 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -36,4 +36,36 @@ jobs: push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file + tags: ghcr.io/abetlen/llama-cpp-python:latest + + docker-cuda: + name: Build and push Docker image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + file: Dockerfile.cuda + context: . + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest \ No newline at end of file diff --git a/Dockerfile.cuda b/Dockerfile.cuda new file mode 100644 index 0000000..a852f3c --- /dev/null +++ b/Dockerfile.cuda @@ -0,0 +1,15 @@ +FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 + +# We need to set the host to 0.0.0.0 to allow outside access +ENV HOST 0.0.0.0 + +COPY . . 
+ +# Install the package +RUN apt update && apt install -y python3 python3-pip +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +RUN LLAMA_CUBLAS=1 python3 setup.py develop + +# Run the server +CMD python3 -m llama_cpp.server \ No newline at end of file From 07a56dd9c2e16ba51313831d41016e13d3ce9a3f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 10:39:39 -0400 Subject: [PATCH 132/443] Update job name --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 2b340d8..878660e 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -39,7 +39,7 @@ jobs: tags: ghcr.io/abetlen/llama-cpp-python:latest docker-cuda: - name: Build and push Docker image + name: Build and push Docker CUDA image runs-on: ubuntu-latest steps: - name: Checkout From a02aa121da0639e93c58403d1d70d37bfc63c5f0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 10:50:48 -0400 Subject: [PATCH 133/443] Remove cuda build job --- .github/workflows/build-docker.yaml | 32 ----------------------------- 1 file changed, 32 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 878660e..44196f1 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -37,35 +37,3 @@ jobs: pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 tags: ghcr.io/abetlen/llama-cpp-python:latest - - docker-cuda: - name: Build and push Docker CUDA image - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - uses: docker/build-push-action@v4 - with: - file: Dockerfile.cuda - context: . 
- push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest \ No newline at end of file From 6d3c20e39dbae1a0c89e1ce6d5bec076b102f2e6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 22:20:53 -0400 Subject: [PATCH 134/443] Add CUDA docker image build to github actions --- .github/workflows/build-docker.yaml | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 44196f1..8ffa45f 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -37,3 +37,41 @@ jobs: pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 tags: ghcr.io/abetlen/llama-cpp-python:latest + + docker-cuda: + name: Build and push Docker CUDA image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Setup CUDA 12.1 + uses: Jimver/cuda-toolkit@v0.2.10 + id: cuda-toolkit + with: + cuda: '12.1.0' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + file: Dockerfile.cuda + context: . + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest From 0607f6578efe03c7b8894d2ed5f71eaf03473c55 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 23:22:16 -0400 Subject: [PATCH 135/443] Use network installer for cuda --- .github/workflows/build-docker.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 8ffa45f..2ec5c0d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -52,6 +52,7 @@ jobs: id: cuda-toolkit with: cuda: '12.1.0' + method: network - name: Set up QEMU uses: docker/setup-qemu-action@v2 From d594892fd425cb41b30e4cb31e3aa5ef1c16e681 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 00:02:46 -0400 Subject: [PATCH 136/443] Remove Docker CUDA build job --- .github/workflows/build-docker.yaml | 41 +---------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 2ec5c0d..16b00a2 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -36,43 +36,4 @@ jobs: push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest - - docker-cuda: - name: Build and push Docker CUDA image - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - - - name: Setup CUDA 12.1 - uses: Jimver/cuda-toolkit@v0.2.10 - id: cuda-toolkit - with: - cuda: '12.1.0' - method: network - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - 
uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - uses: docker/build-push-action@v4 - with: - file: Dockerfile.cuda - context: . - push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file From 329297fafb4916951cf1c3146505a9501e986d95 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:18:40 -0400 Subject: [PATCH 137/443] Bugfix: Missing logits_to_logprobs --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fef7b3e..8cd77ee 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -639,7 +639,7 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama._logits_to_logprobs(row) for row in self.eval_logits] + all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -985,7 +985,7 @@ class Llama: return llama_cpp.llama_token_bos() @staticmethod - def logits_to_logprobs(logits: List[llama_cpp.c_float]) -> List[llama_cpp.c_float]: + def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) - return [llama_cpp.c_float(math.log(x / sum_exps)) for x in exps] + return [math.log(x / sum_exps) for x in exps] From d78cec67df876221471782e7e1fbe62abf48ee25 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:20:25 -0400 Subject: [PATCH 138/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e216aa0..2edbdb0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e216aa04633892b972d013719e38b59fd4917341 +Subproject commit 2edbdb0f99336cb41f0995061c7602ed54beb863 From cabd8b8ed1ee45a19baa9436668898bbe9471492 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:21:20 -0400 Subject: [PATCH 139/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64f7a0d..2dab374 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.41" +version = "0.1.42" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f7f0fa4..0a52826 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.41", + version="0.1.42", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 0e9f227afd4537018c7fe5c3018b22871708cb65 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Thu, 4 May 2023 18:33:08 +0200 Subject: [PATCH 140/443] Update low level examples --- examples/low_level_api/Chat.py | 70 ++++++ examples/low_level_api/Miku.py | 59 +++++ 
examples/low_level_api/ReasonAct.py | 49 +++++ examples/low_level_api/common.py | 163 +++++++++----- .../low_level_api/low_level_api_chat_cpp.py | 202 +++++++++++++++--- .../low_level_api/low_level_api_llama_cpp.py | 35 ++- 6 files changed, 486 insertions(+), 92 deletions(-) create mode 100644 examples/low_level_api/Chat.py create mode 100644 examples/low_level_api/Miku.py create mode 100644 examples/low_level_api/ReasonAct.py diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py new file mode 100644 index 0000000..1015508 --- /dev/null +++ b/examples/low_level_api/Chat.py @@ -0,0 +1,70 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. 
+{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py new file mode 100644 index 0000000..eb9a2cf --- /dev/null +++ b/examples/low_level_api/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? 
^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py new file mode 100644 index 0000000..82e5c44 --- /dev/null +++ b/examples/low_level_api/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 061ec3a..6c35cc5 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -1,8 +1,9 @@ import os import argparse +import re from dataclasses import dataclass, field -from typing import List, Optional +from typing import List # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp @@ -12,23 +13,35 @@ class GptParams: seed: int = -1 n_threads: int = min(4, os.cpu_count() or 1) n_predict: int = 128 - repeat_last_n: int = 64 n_parts: int = -1 n_ctx: int = 512 n_batch: int = 8 n_keep: int = 0 + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) top_k: int = 40 top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 temp: float = 0.80 repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" + path_session: str = "" input_prefix: str = " " - antiprompt: List[str] = field(default_factory=list) + lora_adapter: str = "" + lora_base: str = "" + memory_f16: bool = True random_prompt: bool = False use_color: bool = False @@ -38,7 +51,7 @@ class GptParams: interactive_start: bool = False instruct: bool = False - ignore_eos: bool = False + penalize_nl: 
bool = True perplexity: bool = False use_mmap: bool = True use_mlock: bool = False @@ -61,59 +74,42 @@ class GptParams: instruct_inp_suffix: str="\n\n### Response:\n\n" -def gpt_params_parse(argv = None, params: Optional[GptParams] = None): - if params is None: - params = GptParams() - +def gpt_params_parse(argv = None): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + 
parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument( "-r", "--reverse-prompt", @@ -122,16 +118,71 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", 
help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = int(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 6fced65..4e129ee 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -10,9 +10,10 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt @@ -77,6 +78,7 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo @@ -94,6 +96,19 @@ specified) expect poor results""", file=sys.stderr) if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter, + self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +132,49 @@ specified) expect poor results""", file=sys.stderr) with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_int() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 0): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + break + n_matching_session_tokens += 1 + + if n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,6 +183,7 @@ specified) expect poor results""", file=sys.stderr) self.inp_suffix = 
self._tokenize(self.params.instruct_inp_suffix, False) # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None if (self.params.instruct): self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) @@ -171,16 +223,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) if len(self.params.input_prefix) > 0: print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - print(f"""sampling: temp = {self.params.temp},\ + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ top_p = {self.params.top_p},\ -repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty} +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ -generate: n_ctx = {self.n_ctx}, \ -n_batch = {self.params.n_batch}, \ -n_predict = {self.params.n_predict}, \ +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ n_keep = {self.params.n_keep} + """, file=sys.stderr) # determine antiprompt tokens @@ -198,6 +258,9 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) + self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -229,31 +292,117 @@ n_keep = {self.params.n_keep} self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + # REVIEW + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") + if len(self.embd) > 0 and not len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + self.n_past += len(self.embd) self.embd = [] - if len(self.embd_inp) <= self.input_consumed: + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting # out of user input, sample next token + top_k = 
llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - if (self.params.ignore_eos): - logits = llama_cpp.llama_get_logits(self.ctx) - logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + self.session_tokens, + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) - _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] - id = llama_cpp.llama_sample_top_p_top_k( - self.ctx, - (llama_cpp.llama_token * len(_arr))(*_arr), - len(_arr), - self.params.top_k, - self.params.top_p, - self.params.temp, - self.params.repeat_penalty, - ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) @@ -288,7 +437,7 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - if self.params.instruct: + if self.antiecho != None: for r in self.antiecho(id): yield r 
else: @@ -316,7 +465,7 @@ n_keep = {self.params.n_keep} if (not self.params.instruct): for i in self.llama_token_eot: yield i - break + break # respect n_predict even if antiprompt is present if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): @@ -356,7 +505,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") # read user input def read_input(self): @@ -415,8 +564,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" - args = gpt_params_parse() - params = GptParams(**vars(args)) + params = gpt_params_parse() with LLaMAInteract(params) as m: m.interact() diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index 4fb5a03..9e38ec7 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -37,6 +37,10 @@ embd = [] last_n_size = 64 last_n_tokens_data = [0] * last_n_size n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -47,15 +51,28 @@ while remaining_tokens > 0: n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - id = llama_cpp.llama_sample_top_p_top_k( - ctx, - (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), - len(last_n_tokens_data), - 40, - 0.8, - 0.2, - 1.0 / 0.85, - ) + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False From 501321875f449594c249cdbbc9b48208fbce4bde Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Thu, 4 May 2023 21:03:19 +0200 Subject: [PATCH 141/443] Slim-Bullseye based docker image ends up at ~669MB --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 14fb3be..f58506f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3-bullseye +FROM python:3-slim-bullseye # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -6,10 +6,10 @@ ENV HOST 0.0.0.0 COPY . . 
# Install the package -RUN apt update && apt install -y libopenblas-dev +RUN apt update && apt install -y libopenblas-dev ninja-build build-essential RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette RUN LLAMA_OPENBLAS=1 python3 setup.py develop # Run the server -CMD python3 -m llama_cpp.server \ No newline at end of file +CMD python3 -m llama_cpp.server From 97c6372350c57a4fffb6072cb299e5a9bd8b38dc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:27 -0400 Subject: [PATCH 142/443] Rewind model to longest prefix. --- llama_cpp/llama.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8cd77ee..7a8c25b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -390,18 +390,28 @@ class Llama: """ assert self.ctx is not None - if ( - reset - and len(self.eval_tokens) > 0 - and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)]) - ): - if self.verbose: - print("Llama.generate: cache hit", file=sys.stderr) - reset = False - tokens = tokens[len(self.eval_tokens) :] + if reset and len(self.eval_tokens) > 0: + longest_prefix = 0 + for a, b in zip(self.eval_tokens, tokens[:-1]): + if a == b: + longest_prefix += 1 + else: + break + if longest_prefix > 0: + if self.verbose: + print("Llama.generate: prefix-match hit", file=sys.stderr) + reset = False + tokens = tokens[longest_prefix:] + for _ in range(len(self.eval_tokens) - longest_prefix): + self.eval_tokens.pop() + try: + self.eval_logits.pop() + except IndexError: + pass if reset: self.reset() + while True: self.eval(tokens) token = self.sample( From 853dc711cc5507ca119cb822f459cd16c9021f15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:36 -0400 Subject: [PATCH 143/443] Format --- llama_cpp/llama.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7a8c25b..32d5424 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -649,7 +649,10 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] + all_logprobs = [ + Llama.logits_to_logprobs(list(map(float, row))) + for row in self.eval_logits + ] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -968,7 +971,10 @@ class Llama: llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) if self.verbose: - print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr) + print( + f"Llama.save_state: saving {n_bytes} bytes of llama state", + file=sys.stderr, + ) return LlamaState( eval_tokens=self.eval_tokens.copy(), eval_logits=self.eval_logits.copy(), From 5c165a85da5a340aca85a44e2282db2e5f729463 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:59:37 -0400 Subject: [PATCH 144/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2dab374..ca0346f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.42" +version = "0.1.43" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 
0a52826..405886a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.42", + version="0.1.43", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 952ba9ecaf7a78be1844a1c533d6f6f580b92833 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:21:57 +0200 Subject: [PATCH 145/443] Update README.md add windows server commad --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index a8afa67..ee6ec2d 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,20 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: +Linux ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin python3 -m llama_cpp.server ``` +Windows +```cmd +pip install llama-cpp-python[server] +SET MODEL=\models\7B\ggml-model.bin +python3 -m llama_cpp.server +``` + Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. ## Docker image From eb54e30f343251767ec0a2cb10da2684b896718f Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:22:41 +0200 Subject: [PATCH 146/443] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6ec2d..d24bad5 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ python3 -m llama_cpp.server Windows ```cmd pip install llama-cpp-python[server] -SET MODEL=\models\7B\ggml-model.bin +SET MODEL=..\models\7B\ggml-model.bin python3 -m llama_cpp.server ``` From 24fc38754b6da802ae5b32fb301e957868ec5e86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:08:28 -0400 Subject: [PATCH 147/443] Add cli options to server. Closes #37 --- llama_cpp/server/__main__.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fbee37..5c9598a 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -22,12 +22,26 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
""" import os +import argparse + import uvicorn -from llama_cpp.server.app import create_app +from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - app = create_app() + parser = argparse.ArgumentParser() + for name, field in Settings.__fields__.items(): + parser.add_argument( + f"--{name}", + dest=name, + type=field.type_, + default=field.default, + help=field.field_info.description, + ) + + args = parser.parse_args() + settings = Settings(**vars(args)) + app = create_app(settings=settings) uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) From 5be0efa5f8f98f4b889ca9869e5005ecb5f195d2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:21:49 -0400 Subject: [PATCH 148/443] Cache should raise KeyError when key is missing --- llama_cpp/llama.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 32d5424..4e03ed4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -33,12 +33,10 @@ class LlamaCache: return k return None - def __getitem__( - self, key: Sequence[llama_cpp.llama_token] - ) -> Optional["LlamaState"]: + def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": _key = self._find_key(tuple(key)) if _key is None: - return None + raise KeyError(f"Key not found: {key}") return self.cache_state[_key] def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: From b6a9a0b6ba74c8b539e98ec31fc6558563b20c96 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:22:27 -0400 Subject: [PATCH 149/443] Add types for all low-level api functions --- llama_cpp/llama.py | 2 +- llama_cpp/llama_cpp.py | 81 +++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4e03ed4..c1c8847 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -52,7 +52,7 @@ class LlamaState: self, eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[llama_cpp.c_float]], - llama_state, + llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): self.eval_tokens = eval_tokens diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 30414f5..0a35445 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -17,7 +17,7 @@ import pathlib # Load the library -def _load_shared_library(lib_base_name): +def _load_shared_library(lib_base_name: str): # Determine the file extension based on the platform if sys.platform.startswith("linux"): lib_ext = ".so" @@ -252,7 +252,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -262,7 +264,9 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read -def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_set_state_data(ctx, src) @@ -274,9 +278,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out, + n_token_count_out, # type: Array[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -294,7 +298,10 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( - ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -433,8 +440,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -456,8 +463,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -484,7 +491,10 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates): +def llama_sample_softmax( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -497,7 +507,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + k: c_int, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -513,7 +526,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -529,7 +545,10 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + z: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -545,7 +564,10 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -559,7 +581,11 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None -def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + temp: c_float +): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -578,7 +604,12 @@ _lib.llama_sample_temperature.restype = None # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
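
Aside: the samplers being annotated here (top-k, top-p, temperature, and friends) live in llama.cpp itself; the Python side only forwards pointers. As a rough mental model only — not the actual implementation, and with the processing order simplified — a pure-Python stand-in for the top-k / top-p / temperature chain could look like this:

```python
# Conceptual sketch only: what the bound top-k / top-p / temperature
# samplers do to the candidate list, reimplemented naively in Python.
import math
import random

def sample(logits, temperature=0.8, top_k=40, top_p=0.95):
    # Temperature: rescale logits (lower temperature -> sharper distribution).
    scaled = [l / max(temperature, 1e-8) for l in logits]
    # Softmax over the scaled logits.
    m = max(scaled)
    exps = [math.exp(l - m) for l in scaled]
    total = sum(exps)
    candidates = sorted(
        ((tok, e / total) for tok, e in enumerate(exps)),
        key=lambda tp: tp[1],
        reverse=True,
    )
    # Top-k: keep only the k most probable candidates.
    candidates = candidates[:top_k]
    # Top-p (nucleus): keep the smallest prefix whose cumulative mass reaches top_p.
    kept, cum = [], 0.0
    for tok, p in candidates:
        kept.append((tok, p))
        cum += p
        if cum >= top_p:
            break
    tokens, weights = zip(*kept)
    return random.choices(tokens, weights=weights, k=1)[0]

print(sample([2.0, 1.0, 0.5, -1.0, -3.0]))
```
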
def llama_sample_token_mirostat( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + m: c_int, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -600,7 +631,11 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -616,7 +651,10 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. -def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -628,7 +666,10 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. -def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 22c3056b2a8d19f2c5ce9ab817e312da21e66d9c Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 18:40:00 +0200 Subject: [PATCH 150/443] Update README.md added MacOS --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d24bad5..c46fa11 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux +Linux/MacOS ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin From 5e7ddfc3d6933471ba503477c0513a8987db4d9a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 13:54:22 -0400 Subject: [PATCH 151/443] Fix llama_cpp types --- llama_cpp/llama_cpp.py | 74 +++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0a35445..87d9249 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -8,6 +8,7 @@ from ctypes import ( c_void_p, c_bool, POINTER, + _Pointer, # type: ignore Structure, Array, c_uint8, @@ -252,9 +253,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -278,9 +277,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, # type: Array[llama_token] + tokens_out: Array[llama_token], n_token_capacity: c_size_t, - n_token_count_out, # type: Array[c_size_t] + n_token_count_out: _Pointer[c_size_t], ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -300,7 +299,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -321,7 +320,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -440,8 +439,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, ): @@ -463,8 +462,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -491,10 +490,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] -): +def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): return _lib.llama_sample_softmax(ctx, candidates) @@ -507,10 +503,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - k: c_int, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + k: c_int, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -526,10 +522,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -546,9 +542,9 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], z: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -565,9 +561,9 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -582,9 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -605,11 +599,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, - eta: c_float, + eta: c_float, m: c_int, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -632,10 +626,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
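
Aside: the mirostat parameters documented above (target surprise `tau`, learning rate `eta`, `mu` initialised to `2 * tau`) drive an adaptive truncation loop. The sketch below is a simplified reading of mirostat v2 for intuition only; llama.cpp's implementation differs in detail, and the `tau=5.0` / `eta=0.1` defaults simply mirror the example defaults used later in `common.py`.

```python
# Simplified, assumption-laden sketch of mirostat v2, not the C code.
import math
import random

def mirostat_v2_step(probs, mu, tau=5.0, eta=0.1):
    # Keep candidates whose surprise (-log2 p) does not exceed mu; always
    # keep at least the most probable token.
    kept = [(tok, p) for tok, p in enumerate(probs) if -math.log2(p) <= mu]
    if not kept:
        kept = [max(enumerate(probs), key=lambda tp: tp[1])]
    tokens, weights = zip(*kept)
    tok = random.choices(tokens, weights=weights, k=1)[0]
    # Move mu toward the target surprise.
    observed = -math.log2(probs[tok])
    return tok, mu - eta * (observed - tau)

mu = 2 * 5.0  # mu starts at twice the target cross-entropy, as noted above
tok, mu = mirostat_v2_step([0.5, 0.3, 0.15, 0.05], mu)
print(tok, mu)
```
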
def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - tau: c_float, + candidates: _Pointer[llama_token_data], + tau: c_float, eta: c_float, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -652,8 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -667,8 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 6702d2abfdc313873931baa470b8b547dd825727 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:30 -0400 Subject: [PATCH 152/443] Fix candidates type --- llama_cpp/llama_cpp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 87d9249..61b40f8 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -439,7 +439,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, @@ -462,7 +462,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, @@ -504,7 +504,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -523,7 +523,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -542,7 +542,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -561,7 +561,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -578,7 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -599,7 +599,7 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, m: c_int, @@ -626,7 +626,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, mu: _Pointer[c_float], @@ -646,7 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -660,7 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. 
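
Aside: the last two bindings above pair greedy selection with probability-weighted selection. Conceptually (illustrative Python only, not the C code; the candidate list is made up):

```python
import random

def greedy_pick(candidates):
    # What llama_sample_token_greedy does conceptually: argmax over probability.
    return max(candidates, key=lambda c: c[1])[0]

def random_pick(candidates):
    # What llama_sample_token does conceptually: draw proportionally to probability.
    tokens, probs = zip(*candidates)
    return random.choices(tokens, weights=probs, k=1)[0]

cands = [(0, 0.7), (1, 0.2), (2, 0.1)]  # (token_id, probability) pairs, invented
print(greedy_pick(cands), random_pick(cands))
```
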
def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 66e28eb548974fe50aa80b8593f77cff651959c6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:41 -0400 Subject: [PATCH 153/443] Fix temperature bug --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c1c8847..6cd65a4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if temp == 0.0: + if float(temp) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 40501435c12578fc0bc696c2bdc0bf63d0e15650 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:04:12 -0400 Subject: [PATCH 154/443] Fix: types --- llama_cpp/llama_cpp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 61b40f8..8ce3c89 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,6 +141,11 @@ LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + # Functions @@ -257,7 +262,7 @@ def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_ return _lib.llama_copy_state_data(ctx, dest) -_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_copy_state_data.restype = c_size_t @@ -269,7 +274,7 @@ def llama_set_state_data( return _lib.llama_set_state_data(ctx, src) -_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_set_state_data.restype = c_size_t @@ -291,7 +296,7 @@ _lib.llama_load_session_file.argtypes = [ c_char_p, llama_token_p, c_size_t, - POINTER(c_size_t), + c_size_t_p, ] _lib.llama_load_session_file.restype = c_size_t @@ -340,7 +345,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -385,7 +390,7 @@ def llama_get_logits(ctx: llama_context_p): _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input @@ -395,7 +400,7 @@ def llama_get_embeddings(ctx: llama_context_p): _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. 
Uses the vocabulary in the provided context @@ -614,7 +619,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ c_float, c_float, c_int, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -639,7 +644,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ llama_token_data_array_p, c_float, c_float, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat_v2.restype = llama_token From e24c3d7447e158164397686bbecac2d22d8a75a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:05:31 -0400 Subject: [PATCH 155/443] Prefer explicit imports --- llama_cpp/llama_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8ce3c89..f6a71fa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -68,11 +68,11 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_VERSION = c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(1) +LLAMA_SESSION_VERSION = c_int(1) llama_context_p = c_void_p @@ -128,18 +128,18 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) -LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors # Misc c_float_p = POINTER(c_float) @@ -216,8 +216,8 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_apply_lora_from_file( ctx: llama_context_p, - path_lora: ctypes.c_char_p, - path_base_model: ctypes.c_char_p, + path_lora: c_char_p, + path_base_model: c_char_p, n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 3e28e0e50ccd7b579ae99b0fbe163fbed8888167 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:12:26 -0400 Subject: [PATCH 156/443] Fix: runtime type errors --- llama_cpp/llama_cpp.py | 52 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f6a71fa..3b1ac1e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -258,7 +258,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
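
Aside: the state functions touched in these patches (`llama_get_state_size`, `llama_copy_state_data`, `llama_set_state_data`) are normally used together: size a buffer, copy the state out, and later copy it back in. A hedged usage sketch, assuming an already-created `ctx` and the bindings as declared above:

```python
# Hedged sketch, not repo code: snapshot and restore a context's state.
# `ctx` is assumed to be a valid llama_context_p created elsewhere.
import ctypes
import llama_cpp

def snapshot_state(ctx):
    size = int(llama_cpp.llama_get_state_size(ctx))
    buf = (ctypes.c_uint8 * size)()  # Python-owned destination buffer
    n_copied = llama_cpp.llama_copy_state_data(ctx, buf)  # array converts to c_uint8_p
    return buf, n_copied

def restore_state(ctx, buf):
    return llama_cpp.llama_set_state_data(ctx, buf)
```
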
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -282,9 +284,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out: Array[llama_token], + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out: _Pointer[c_size_t], + n_token_count_out, # type: _Pointer[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -304,7 +306,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,7 +327,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -345,7 +347,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -444,8 +446,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -467,8 +469,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -495,7 +497,9 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -509,7 +513,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -528,7 +532,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -547,7 +551,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -566,7 +570,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -583,7 +587,9 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -604,11 +610,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, m: c_int, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -631,10 +637,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -651,7 +657,8 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
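
Aside: PATCH 156 moves the parameter annotations back into `# type:` comments because ctypes pointer classes such as `_Pointer` are ordinary runtime objects, not typing generics, so subscripting them when the `def` is evaluated can fail on the Python versions targeted here — presumably the runtime errors the commit title refers to. A small, assumption-laden illustration of the pattern the module settles on (checker-only hints in comments, real ctypes types for `argtypes`/`restype`):

```python
# Illustration only: keep ctypes types out of evaluated annotations and put
# the checker-facing hint in a comment instead.
from ctypes import POINTER, c_float, cast

c_float_p = POINTER(c_float)  # real runtime type, safe for argtypes/restype

def scale_in_place(buf,  # type: "ctypes._Pointer[c_float]"  (checker-only hint)
                   n: int,
                   factor: float):
    for i in range(n):
        buf[i] = buf[i] * factor

values = (c_float * 3)(1.0, 2.0, 3.0)
scale_in_place(cast(values, c_float_p), 3, 2.0)
print(list(values))  # [2.0, 4.0, 6.0]
```
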
def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -665,7 +672,8 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From b5f3e746275bf231df544c60f30b80f537195af7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:22:55 -0400 Subject: [PATCH 157/443] Add return type annotations for embeddings and logits --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b1ac1e..ccec12c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -387,7 +387,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +397,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 98bbd1c6a8ea1f86c010583f6b1ab74996a1c751 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:23:14 -0400 Subject: [PATCH 158/443] Fix eval logits type --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6cd65a4..a643f51 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[llama_cpp.c_float]] = deque( + self.eval_logits: Deque[List[float]] = deque( maxlen=n_ctx if logits_all else 1 ) @@ -245,7 +245,7 @@ class Llama: n_vocab = llama_cpp.llama_n_vocab(self.ctx) cols = int(n_vocab) logits_view = llama_cpp.llama_get_logits(self.ctx) - logits: List[List[llama_cpp.c_float]] = [ + logits: List[List[float]] = [ [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) ] self.eval_logits.extend(logits) @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if float(temp) == 0.0: + if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 79d50a29f40c4b14cd56a329ee50f269e673f277 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Sat, 6 May 2023 01:02:59 +0200 Subject: [PATCH 159/443] Create dependabot.yml --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..91abb11 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get 
started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" From c9bb602b2682ae12c5690829fee1635fcdfc707c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 May 2023 23:25:53 +0000 Subject: [PATCH 160/443] Bump black from 23.1.0 to 23.3.0 Bumps [black](https://github.com/psf/black) from 23.1.0 to 23.3.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.1.0...23.3.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 56 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index a505168..129f923 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "anyio" @@ -42,37 +42,37 @@ tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = 
"sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "aa15e57300668bd23c051b4cd87bec4c1a58dcccd2f2b4767579fea7f2c5fa41" +content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" diff --git a/pyproject.toml b/pyproject.toml index ca0346f..a164ef7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] -black = "^23.1.0" +black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} From 1895c1103379156f4bd2ae895cdab080ab9cd104 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:18:25 +0200 Subject: [PATCH 161/443] Rename postfix to suffix to match upstream --- examples/low_level_api/Chat.py | 1 + examples/low_level_api/common.py | 4 ++-- examples/low_level_api/low_level_api_chat_cpp.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py index 1015508..fcef8cd 100644 --- a/examples/low_level_api/Chat.py +++ b/examples/low_level_api/Chat.py @@ -63,6 +63,7 @@ params = GptParams( interactive=True, antiprompt=[f"{USER_NAME}:"], input_prefix=" ", + input_suffix=f"{AI_NAME}:", prompt=prompt, ) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 6c35cc5..7a25582 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -37,6 +37,7 @@ class GptParams: prompt: str = "" path_session: str = "" input_prefix: str = " " + input_suffix: str = "" antiprompt: List[str] = field(default_factory=list) lora_adapter: str = "" @@ -64,7 +65,6 @@ class GptParams: # Set to "\nUser:" etc. 
# This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" fix_prefix: str = "" - output_postfix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -110,6 +110,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -158,7 +159,6 @@ def gpt_params_parse(argv = None): #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument( diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 4e129ee..72ced2b 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -527,8 +527,8 @@ n_keep = {self.params.n_keep} self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") self.set_color(CONSOLE_COLOR_DEFAULT) try: From 9797394c81133eebb367bd0673b6c89eefd5a38e Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:27:52 +0200 Subject: [PATCH 162/443] Wrong logit_bias parsed type --- examples/low_level_api/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 7a25582..2bfe356 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -180,7 +180,7 @@ def gpt_params_parse(argv = None): if (logit_bias_str != None): for i in logit_bias_str: if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = int(m.group(2)) + params.logit_bias[int(m.group(1))] = float(m.group(2)) return params From 3ceb47b597a8819db3afa851df4ae3211f2cb680 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:35:50 +0200 Subject: [PATCH 163/443] Fix mirastat requiring c_float --- .../low_level_api/low_level_api_chat_cpp.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 72ced2b..55b24cd 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -357,7 +357,7 @@ n_keep = {self.params.n_keep} # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): - logits[key] += value + logits[key] += llama_cpp.c_float(value) _arr = (llama_cpp.llama_token_data * n_vocab)(*[ llama_cpp.llama_token_data(token_id, 
logits[token_id], 0.0) @@ -372,14 +372,14 @@ n_keep = {self.params.n_keep} _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.repeat_penalty) + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: logits[llama_cpp.llama_token_nl()] = nl_logit - + if self.params.temp <= 0: # Greedy sampling id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) @@ -387,19 +387,19 @@ n_keep = {self.params.n_keep} if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) From 996f63e9e1804b2d9a91c5081665ea536a85542f Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 15:16:58 +0200 Subject: [PATCH 164/443] Add utf8 to chat example --- examples/low_level_api/common.py | 2 +- .../low_level_api/low_level_api_chat_cpp.py | 73 +++++++------- examples/low_level_api/util.py | 95 +++++++++++++++++++ 3 files changed, 130 insertions(+), 40 deletions(-) create mode 100644 examples/low_level_api/util.py diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py 
index 2bfe356..55d08db 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -102,7 +102,7 @@ def gpt_params_parse(argv = None): parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") - parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 55b24cd..9a9bc01 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -17,34 +17,7 @@ from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt - -ANSI_COLOR_RESET = "\x1b[0m" -ANSI_COLOR_YELLOW = "\x1b[33m" -ANSI_BOLD = "\x1b[1m" -ANSI_COLOR_GREEN = "\x1b[32m" - -CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET -CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW -CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN - -# Iterative search -# Actively searches and prevents a pattern from being returned -class IterSearch: - def __init__(self, pattern): - self.pattern = list(pattern) - self.buffer = [] - - def __call__(self, char): - self.buffer += [char] - - if (self.pattern[:len(self.buffer)] == self.buffer): - if (len(self.buffer) >= len(self.pattern)): - self.buffer.clear() - return [] - - _tmp = self.buffer[:] - self.buffer.clear() - return _tmp +import util # A LLaMA interactive session class LLaMAInteract: @@ -82,6 +55,7 @@ specified) expect poor results""", file=sys.stderr) self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo + self.multibyte_fix = [] # model load self.lparams = llama_cpp.llama_context_default_params() @@ -188,7 +162,7 @@ specified) expect poor results""", file=sys.stderr) self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) self.first_antiprompt.append(_ptn) - self.antiecho = IterSearch(_ptn) + self.antiecho = util.IterSearch(_ptn) # enable interactive mode if reverse prompt or interactive start is specified if (len(self.params.antiprompt) != 0 or self.params.interactive_start): @@ -256,14 +230,14 @@ n_keep = {self.params.n_keep} - If you want to submit another line, end your input in '\\'. 
""", file=sys.stderr) - self.set_color(CONSOLE_COLOR_PROMPT) + self.set_color(util.CONSOLE_COLOR_PROMPT) self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] @@ -295,7 +269,6 @@ n_keep = {self.params.n_keep} self.params.path_session = "" # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - # REVIEW if self.n_session_consumed < len(self.session_tokens): for i in range(len(self.embd)): if self.embd[i] != self.session_tokens[self.n_session_consumed]: @@ -445,7 +418,7 @@ n_keep = {self.params.n_keep} # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop @@ -486,12 +459,12 @@ n_keep = {self.params.n_keep} def exit(self): llama_cpp.llama_free(self.ctx) - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") # write input def input(self, prompt: str): @@ -505,7 +478,29 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + + # Add remainder of missing bytes + if None in self.multibyte_fix: + self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char + + # Return completed utf char + if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + yield (b"".join(self.multibyte_fix)).decode("utf8") + self.multibyte_fix = [] + continue + + # Contains multi-byte UTF8 + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if pattern & int.from_bytes(cur_char) == pattern: + self.multibyte_fix = [cur_char] + ([None] * (num-1)) + + # Stop incomplete bytes from passing + if len(self.multibyte_fix) > 0: + continue + + yield cur_char.decode("utf8") # read user input def read_input(self): @@ -521,7 +516,7 @@ n_keep = {self.params.n_keep} self.params.input_echo = False while self.params.interactive: - self.set_color(CONSOLE_COLOR_USER_INPUT) + self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): print('\n> ', end="") self.input(self.read_input()) @@ -529,13 +524,13 @@ n_keep = {self.params.n_keep} print(self.params.input_prefix, end="") self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") print(self.params.input_suffix,end="") - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) try: for i in self.output(): print(i,end="",flush=True) except KeyboardInterrupt: - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if not self.params.instruct: print(self.params.fix_prefix,end="") self.input(self.params.fix_prefix) diff --git 
a/examples/low_level_api/util.py b/examples/low_level_api/util.py new file mode 100644 index 0000000..9d0ec2f --- /dev/null +++ b/examples/low_level_api/util.py @@ -0,0 +1,95 @@ + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN + +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + +class Circle: + def __init__(self, size, default=0): + self.list = [default] * size + self.maxsize = size + self.size = 0 + self.offset = 0 + + def append(self, elem): + if self.size < self.maxsize: + self.list[self.size] = elem + self.size += 1 + else: + self.list[self.offset] = elem + self.offset = (self.offset + 1) % self.maxsize + + def __getitem__(self, val): + if isinstance(val, int): + if 0 > val or val >= self.size: + raise IndexError('Index out of range') + return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] + elif isinstance(val, slice): + start, stop, step = val.start, val.stop, val.step + if step is None: + step = 1 + if start is None: + start = 0 + if stop is None: + stop = self.size + if start < 0: + start = self.size + start + if stop < 0: + stop = self.size + stop + + indices = range(start, stop, step) + return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size] + else: + raise TypeError('Invalid argument type') + + + + +if __name__ == "__main__": + c = Circle(5) + + c.append(1) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1] + + for i in range(2,5+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1,2,3,4,5] + + for i in range(5+1,9+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 5 + assert c[:5] == [5,6,7,8,9] + #assert c[:-5] == [5,6,7,8,9] + assert c[:10] == [5,6,7,8,9] + From fd80ddf703373f523bda4e62d24564fa8930f670 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 22:22:28 +0200 Subject: [PATCH 165/443] Fix a bug with wrong type --- examples/low_level_api/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 9a9bc01..272b454 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -330,7 +330,7 @@ n_keep = {self.params.n_keep} # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): - logits[key] += llama_cpp.c_float(value) + logits[key] += value _arr = (llama_cpp.llama_token_data * n_vocab)(*[ llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) From aa203a0d651351f4c1604e3482f7a3a8ca1eabf7 Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Sat, 6 May 2023 22:47:47 +0200 Subject: [PATCH 166/443] Added mirostat sampling to the high level API. 
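This mirrors the low-level chat example above: `mirostat_mode` selects between the existing temperature/top-k/top-p pipeline (0), Mirostat (1, which also uses the `mirostat_m` candidate window), and Mirostat v2 (2), while `mirostat_tau` sets the target surprise value and `mirostat_eta` the learning rate. A minimal usage sketch, assuming a local model path and the `mirostat_mode` keyword alignment applied a few patches later in "Fixed somethings and activated cublas":

```python
# Usage sketch only: the model path and prompt are placeholders.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model.bin")

output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=64,
    temperature=0.8,
    mirostat_mode=2,   # 0 = disabled, 1 = Mirostat, 2 = Mirostat v2
    mirostat_tau=5.0,  # target entropy / average surprise
    mirostat_eta=0.1,  # learning rate
)
print(output["choices"][0]["text"])
```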
--- llama_cpp/llama.py | 84 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a643f51..8d5df10 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -257,6 +257,11 @@ class Llama: top_k: llama_cpp.c_int, top_p: llama_cpp.c_float, temp: llama_cpp.c_float, + mirostat_mode: llama_cpp.c_int, + mirostat_tau: llama_cpp.c_float, + mirostat_eta: llama_cpp.c_float, + mirostat_mu: llama_cpp.c_float, + mirostat_m: llama_cpp.c_int, repeat_penalty: llama_cpp.c_float, ): assert self.ctx is not None @@ -287,7 +292,34 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if float(temp.value) == 0.0: + if mirostat_mode == 1: + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + temp=temp, + ) + llama_cpp.llama_sample_token_mirostat( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + tau=mirostat_tau, + eta=mirostat_eta, + mu=mirostat_mu, + m=mirostat_m + ) + elif mirostat_mode == 2: + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + temp=temp, + ) + llama_cpp.llama_sample_token_mirostat_v2( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + tau=mirostat_tau, + eta=mirostat_eta, + mu=mirostat_mu + ) + elif float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), @@ -328,6 +360,11 @@ class Llama: top_k: int, top_p: float, temp: float, + mirostat_mode: int, + mirostat_tau: float, + mirostat_eta: float, + mirostat_mu: float, + mirostat_m: int, repeat_penalty: float, ): """Sample a token from the model. @@ -353,6 +390,11 @@ class Llama: top_k=llama_cpp.c_int(top_k), top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), + mirostat=llama_cpp.c_int(mirostat_mode), + mirostat_mu=llama_cpp.c_float(mirostat_mu), + mirostat_tau=llama_cpp.c_float(mirostat_tau), + mirostat_eta=llama_cpp.c_float(mirostat_eta), + mirostat_m=llama_cpp.c_int(mirostat_m), repeat_penalty=llama_cpp.c_float(repeat_penalty), ) @@ -362,6 +404,11 @@ class Llama: top_k: int, top_p: float, temp: float, + mirostat: int, + mirostat_tau: float, + mirostat_eta: float, + mirostat_mu: float, + mirostat_m: int, repeat_penalty: float, reset: bool = True, ) -> Generator[ @@ -416,6 +463,11 @@ class Llama: top_k=top_k, top_p=top_p, temp=temp, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + mirostat_mu=mirostat_mu, + mirostat_m=mirostat_m, repeat_penalty=repeat_penalty, ) tokens_or_none = yield token @@ -486,6 +538,11 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 16, temperature: float = 0.8, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + mirostat_mu: float = 10, + mirostat_m: int = 100, top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -536,6 +593,11 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + mirostat_mu=mirostat_mu, + mirostat_m=mirostat_m, repeat_penalty=repeat_penalty, ): if token == llama_cpp.llama_token_eos(): @@ -707,6 +769,11 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + mirostat_mu: float = 10, + mirostat_m: int = 100, 
top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -742,6 +809,11 @@ class Llama: suffix=suffix, max_tokens=max_tokens, temperature=temperature, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + mirostat_mu=mirostat_mu, + mirostat_m=mirostat_m, top_p=top_p, logprobs=logprobs, echo=echo, @@ -762,6 +834,11 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + mirostat_mu: float = 10, + mirostat_m: int = 100, top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -797,6 +874,11 @@ class Llama: suffix=suffix, max_tokens=max_tokens, temperature=temperature, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + mirostat_mu=mirostat_mu, + mirostat_m=mirostat_m, top_p=top_p, logprobs=logprobs, echo=echo, From fdcab2286c8d9e91779590d6facb3aee34456169 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:11:57 +0000 Subject: [PATCH 167/443] Bump mkdocs-material from 9.1.4 to 9.1.9 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.4 to 9.1.9. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.4...9.1.9) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 129f923..287d05e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -792,14 +792,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.4" +version = "9.1.9" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.4-py3-none-any.whl", hash = "sha256:4c92dcf9365068259bef3eed8e0dd5410056b6f7187bdea2d52848c0f94cd94c"}, - {file = "mkdocs_material-9.1.4.tar.gz", hash = "sha256:c3a8943e9e4a7d2624291da365bbccf0b9f88688aa6947a46260d8c165cd4389"}, + {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, + {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" +content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" diff --git a/pyproject.toml b/pyproject.toml index a164ef7..55ca8ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} -mkdocs-material = "^9.1.4" +mkdocs-material = "^9.1.9" pytest = "^7.2.2" httpx = "^0.24.0" From 2a21b8f69e7049f03a4ab3e0b5ec51d81456a796 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:16:08 +0000 Subject: [PATCH 
168/443] Bump mkdocs from 1.4.2 to 1.4.3 Bumps [mkdocs](https://github.com/mkdocs/mkdocs) from 1.4.2 to 1.4.3. - [Release notes](https://github.com/mkdocs/mkdocs/releases) - [Commits](https://github.com/mkdocs/mkdocs/compare/1.4.2...1.4.3) --- updated-dependencies: - dependency-name: mkdocs dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 287d05e..d30dc8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -747,14 +747,14 @@ files = [ [[package]] name = "mkdocs" -version = "1.4.2" +version = "1.4.3" description = "Project documentation with Markdown." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, - {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" +content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" diff --git a/pyproject.toml b/pyproject.toml index 55ca8ce..1f79b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] black = "^23.3.0" twine = "^4.0.2" -mkdocs = "^1.4.2" +mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" pytest = "^7.2.2" From 515d9bde7eb914993b267f81dd5b438f0dc3bead Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Sat, 6 May 2023 23:40:19 +0200 Subject: [PATCH 169/443] Fixed somethings and activated cublas --- llama_cpp/llama.py | 4 ++-- setup.py | 1 + vendor/llama.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8d5df10..d648450 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -390,7 +390,7 @@ class Llama: top_k=llama_cpp.c_int(top_k), top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), - mirostat=llama_cpp.c_int(mirostat_mode), + mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_mu=llama_cpp.c_float(mirostat_mu), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), @@ -404,7 +404,7 @@ class Llama: top_k: int, top_p: float, temp: float, - mirostat: int, + mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, mirostat_mu: float, diff --git a/setup.py b/setup.py index 405886a..0b90312 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ setup( extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], }, + cmake_args=['-DLLAMA_CUBLAS=ON'], python_requires=">=3.7", classifiers=[ "Programming Language :: Python :: 3", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2edbdb0..173d0e6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2edbdb0f99336cb41f0995061c7602ed54beb863 +Subproject 
commit 173d0e6419e8f8f3c1f4f13201b777f4c60629f3 From 33d41fb8f3f949e29d4038fdf542ee8445af190a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:07:39 +0000 Subject: [PATCH 170/443] Bump pytest from 7.2.2 to 7.3.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.2.2 to 7.3.1. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.2.2...7.3.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 30 +++++------------------------- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index d30dc8f..0bd08d5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,25 +21,6 @@ doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16,<0.22)"] -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, -] - -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "black" version = "23.3.0" @@ -1007,18 +988,17 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -1027,7 +1007,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis 
(>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "python-dateutil" @@ -1458,4 +1438,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" +content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" diff --git a/pyproject.toml b/pyproject.toml index 1f79b74..6f83611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" -pytest = "^7.2.2" +pytest = "^7.3.1" httpx = "^0.24.0" [build-system] From ae3c639764359890e692776cfb87ff84b911532f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:16:31 +0000 Subject: [PATCH 171/443] Bump mkdocstrings from 0.20.0 to 0.21.2 Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.20.0 to 0.21.2. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/master/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.20.0...0.21.2) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0bd08d5..5b364a7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -808,14 +808,14 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.20.0" +version = "0.21.2" description = "Automatic documentation from sources, for MkDocs." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.20.0-py3-none-any.whl", hash = "sha256:f17fc2c4f760ec302b069075ef9e31045aa6372ca91d2f35ded3adba8e25a472"}, - {file = "mkdocstrings-0.20.0.tar.gz", hash = "sha256:c757f4f646d4f939491d6bc9256bfe33e36c5f8026392f49eaa351d241c838e5"}, + {file = "mkdocstrings-0.21.2-py3-none-any.whl", hash = "sha256:949ef8da92df9d692ca07be50616459a6b536083a25520fd54b00e8814ce019b"}, + {file = "mkdocstrings-0.21.2.tar.gz", hash = "sha256:304e56a2e90595708a38a13a278e538a67ad82052dd5c8b71f77a604a4f3d911"}, ] [package.dependencies] @@ -826,6 +826,7 @@ mkdocs = ">=1.2" mkdocs-autorefs = ">=0.3.1" mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} [package.extras] crystal = ["mkdocstrings-crystal (>=0.3.4)"] @@ -1438,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" +content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" diff --git a/pyproject.toml b/pyproject.toml index 6f83611..a11faef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ typing-extensions = "^4.5.0" black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.20.0"} +mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.9" pytest = "^7.3.1" httpx = "^0.24.0" From bc853e3742fd2a4718bd66bd501bdb5ede50f6d3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 6 May 2023 21:32:50 -0400 Subject: [PATCH 172/443] Fix type for eval_logits in LlamaState object --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a643f51..fc91ea4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -51,7 +51,7 @@ class LlamaState: def __init__( self, eval_tokens: Deque[llama_cpp.llama_token], - eval_logits: Deque[List[llama_cpp.c_float]], + eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): From c76e0913bbc6a039f5456ca44f4d84966e5c14fd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 6 May 2023 22:18:31 -0400 Subject: [PATCH 173/443] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 80 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++ 2 files changed, 100 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..b8e33e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,80 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). 
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. + +# Expected Behavior + +Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. + +# Current Behavior + +Please provide a detailed written description of what `llama-cpp-python` did, instead. + +# Environment and Context + +Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. + +* Physical (or virtual) hardware you are using, e.g. for Linux: + +`$ lscpu` + +* Operating System, e.g. for Linux: + +`$ uname -a` + +* SDK version, e.g. for Linux: + +``` +$ python3 --version +$ make --version +$ g++ --version +``` + +# Failure Information (for bugs) + +Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. + +# Steps to Reproduce + +Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. + +1. step 1 +2. step 2 +3. step 3 +4. etc. + +**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +# Failure Logs + +Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. + +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. + +Example environment info: +``` +llama-cpp-python$ git log | head -1 +commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 + +llama-cpp-python$ python3 --version +Python 3.10.10 + +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette" +fastapi 0.95.0 +sse-starlette 1.3.3 +uvicorn 0.21.1 +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
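The next patch in the series makes `min_keep` an explicit argument of the low-level sampling wrappers instead of a defaulted `c_size_t`, and threads it through the high-level `Llama._sample` calls. A sketch of the resulting call shape, assuming `ctx` and the `candidates` token-data array are already prepared as in the chat example above:

```python
# Sketch: min_keep is now passed explicitly rather than defaulting inside
# the wrapper; ctx and candidates are assumed to exist already.
llama_cpp.llama_sample_top_k(
    ctx=ctx,
    candidates=llama_cpp.ctypes.pointer(candidates),
    k=llama_cpp.c_int(40),
    min_keep=llama_cpp.c_size_t(1),  # always keep at least one candidate
)
llama_cpp.llama_sample_top_p(
    ctx=ctx,
    candidates=llama_cpp.ctypes.pointer(candidates),
    p=llama_cpp.c_float(0.95),
    min_keep=llama_cpp.c_size_t(1),
)
```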
From 7c3743fe5f2781a8aab9ba8e15f4d250963747cf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 174/443] Update llama.cpp --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_cpp.py | 8 ++++---- vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fc91ea4..0db5c10 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -297,21 +297,25 @@ class Llama: ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), k=top_k, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), z=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=top_p, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ccec12c..527ed7c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -515,7 +515,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -534,7 +534,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -553,7 +553,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -572,7 +572,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2edbdb0..1b0fd45 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2edbdb0f99336cb41f0995061c7602ed54beb863 +Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 From 397ae97f64bb235db5a773a63caaeea5b258a20c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 01:41:19 -0400 Subject: [PATCH 175/443] Update README --- README.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c46fa11..9daca60 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,10 @@ You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` en ## High-level API +The high-level API provides a simple managed interface through the `Llama` class. + +Below is a short example demonstrating how to use the high-level API to generate text: + ```python >>> from llama_cpp import Llama >>> llm = Llama(model_path="./models/7B/ggml-model.bin") @@ -90,8 +94,25 @@ docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml- ## Low-level API -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. 
-The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. +The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). + +Below is a short example demonstrating how to use the low-level API to tokenize a prompt: + +```python +>>> import llama_cpp +>>> import ctypes +>>> params = llama_cpp.llama_context_default_params() +# use bytes for char * params +>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> max_tokens = params.n_ctx +# use ctypes arrays for array params +>>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> llama_cpp.llama_free(ctx) +``` + +Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. # Documentation From c382d8f86a628edec4427ac01687babb5c4aa35f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:00:22 -0400 Subject: [PATCH 176/443] Revert "llama_cpp server: mark model as required" This reverts commit e40fcb05754d0ec9c65359e245a436794cbfefdb. --- llama_cpp/server/app.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 595476f..0b7b1b2 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -149,8 +149,15 @@ class CreateCompletionRequest(BaseModel): description="The number of logprobs to generate. If None, no logprobs are generated." ) - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field @@ -190,11 +197,11 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + model: Optional[str] = model_field input: str = Field( description="The input to embed." 
) + user: Optional[str] class Config: schema_extra = { @@ -235,8 +242,13 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = stop_field stream: bool = stream_field - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field From 86753976c4ce1289a784b7385f419f471f7e8a50 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:02:34 -0400 Subject: [PATCH 177/443] Revert "llama_cpp server: delete some ignored / unused parameters" This reverts commit b47b9549d57f146a00ee19cd7d2bb294111abb67. --- llama_cpp/llama_types.py | 2 ++ llama_cpp/server/app.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b770a01..bfc7342 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,6 +60,8 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Literal["assistant", "user", "system"] content: str + user: NotRequired[str] + class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0b7b1b2..ba2ca2f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -185,7 +185,13 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model" + "model", + "n", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", } ) ) @@ -221,7 +227,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model"})) + return llama.create_embedding(**request.dict(exclude={"model", "user"})) class ChatCompletionRequestMessage(BaseModel): @@ -283,7 +289,12 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model" + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", } ), ) From 1a00e452ea1e82232ffc035647b1c56116ae62ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:52:20 -0400 Subject: [PATCH 178/443] Update settings fields and defaults --- llama_cpp/server/app.py | 94 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ba2ca2f..48dfc5e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -13,18 +13,41 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 512 - n_threads: int = max((os.cpu_count() or 2) // 2, 1) - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - use_mmap: bool = True - embedding: bool = True - last_n_tokens_size: int = 64 - logits_all: bool = False - cache: bool = False # WARNING: This is an experimental feature - vocab_only: bool = False + model: str = Field( + description="The path to the model to use for generating completions." 
+ ) + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max((os.cpu_count() or 2) // 2, 1), + ge=1, + description="The number of threads to use.", + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + use_mlock: bool = Field( + default=bool(llama_cpp.llama_mlock_supported().value), + description="Use mlock.", + ) + use_mmap: bool = Field( + default=bool(llama_cpp.llama_mmap_supported().value), + description="Use mmap.", + ) + embedding: bool = Field(default=True, description="Whether to use embeddings.") + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) router = APIRouter() @@ -74,79 +97,75 @@ def get_llama(): with llama_lock: yield llama -model_field = Field( - description="The model to use for generating completions." -) + +model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." + default=16, ge=1, le=2048, description="The maximum number of tokens to generate." ) temperature_field = Field( default=0.8, ge=0.0, le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", ) top_p_field = Field( default=0.95, ge=0.0, le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." 
+ description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", ) stop_field = Field( default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", ) stream_field = Field( default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." + description="Whether to stream the results as they are generated. Useful for chatbots.", ) top_k_field = Field( default=40, ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", ) repeat_penalty_field = Field( default=1.0, ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) + class CreateCompletionRequest(BaseModel): prompt: Optional[str] = Field( - default="", - description="The prompt to generate completions for." + default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." 
+ description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field echo: bool = Field( default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots." + description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) stop: Optional[List[str]] = stop_field stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, - description="The number of logprobs to generate. If None, no logprobs are generated." + description="The number of logprobs to generate. If None, no logprobs are generated.", ) # ignored or currently unsupported @@ -204,9 +223,7 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field - input: str = Field( - description="The input to embed." - ) + input: str = Field(description="The input to embed.") user: Optional[str] class Config: @@ -239,8 +256,7 @@ class ChatCompletionRequestMessage(BaseModel): class CreateChatCompletionRequest(BaseModel): messages: List[ChatCompletionRequestMessage] = Field( - default=[], - description="A list of messages to generate completions for." + default=[], description="A list of messages to generate completions for." ) max_tokens: int = max_tokens_field temperature: float = temperature_field From 5a3413eee398ff36e0fb496a44e39d960f402a48 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:03:57 -0400 Subject: [PATCH 179/443] Update cpu_count --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 48dfc5e..dfb819c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,8 +1,8 @@ -import os import json +import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict -from typing_extensions import TypedDict, Literal, Annotated +from typing_extensions import TypedDict, Literal import llama_cpp @@ -21,7 +21,7 @@ class Settings(BaseSettings): default=512, ge=1, description="The batch size to use per eval." 
) n_threads: int = Field( - default=max((os.cpu_count() or 2) // 2, 1), + default=max(multiprocessing.cpu_count() // 2, 1), ge=1, description="The number of threads to use.", ) From 3fbda717904080ec0286fc13488e3318e3dec75e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:04:22 -0400 Subject: [PATCH 180/443] Fix mlock_supported and mmap_supported return type --- llama_cpp/llama_cpp.py | 12 ++++++++---- llama_cpp/server/app.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 527ed7c..bce0fd7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index dfb819c..3e45684 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -27,11 +27,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=bool(llama_cpp.llama_mlock_supported().value), + default=llama_cpp.llama_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=bool(llama_cpp.llama_mmap_supported().value), + default=llama_cpp.llama_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") From 5f43c553d59f5ee8ca6bea3044d50ba40bc8b426 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:29:33 -0400 Subject: [PATCH 181/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a11faef..f6d1e9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.43" +version = "0.1.44" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 405886a..020d236 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.43", + version="0.1.44", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 627811ea837f6f3b108d916a5ae802111d0f0690 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:09:10 -0400 Subject: [PATCH 182/443] Add verbose flag to server --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3e45684..f46f920 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,9 @@ class Settings(BaseSettings): vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary." ) + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) router = APIRouter() @@ -83,6 +86,7 @@ def create_app(settings: Optional[Settings] = None): n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, vocab_only=settings.vocab_only, + verbose=settings.verbose, ) if settings.cache: cache = llama_cpp.LlamaCache() From 3adc8fb3ae887d385b4a884814f9055c7165f168 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:10:52 -0400 Subject: [PATCH 183/443] Update README to use cli options for server --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 9daca60..9fa3bed 100644 --- a/README.md +++ b/README.md @@ -68,18 +68,9 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux/MacOS ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Windows -```cmd -pip install llama-cpp-python[server] -SET MODEL=..\models\7B\ggml-model.bin -python3 -m llama_cpp.server +python3 -m llama_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. 
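Once the server is running, it can be smoke-tested from any HTTP client. The sketch below assumes the OpenAI-style `/v1/completions` route listed in the OpenAPI docs above and the default port 8000, with request fields matching `CreateCompletionRequest`:

```python
# Sketch only: the route name is an assumption based on the docs URL above;
# the response shape mirrors the Completion object shown in the README.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "Q: Name the planets in the solar system? A: ",
        "max_tokens": 32,
        "temperature": 0.8,
        "stop": ["Q:", "\n"],
    },
    timeout=120,
)
print(resp.json()["choices"][0]["text"])
```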
From 4f8cf52a38761f8cd611d3f65f07b6fe382445a9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:20:04 -0400 Subject: [PATCH 184/443] Update README --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9fa3bed..b7772d9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ This package provides: - OpenAI-like API - LangChain compatibility -## Installation +## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,8 +26,30 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. -This method defaults to using `make` to build `llama.cpp` on Linux / MacOS and `cmake` on Windows. -You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` environment variable before installing. + +### Installation with OpenBLAS / cuBLAS / CLBlast + +`llama.cpp` supports multiple BLAS backends for faster processing. +Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. + +To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: + +```bash +LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: + +```bash +LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: + +```bash +LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + ## High-level API From 2753b853212bfb81a3643b69eb666443ad03d494 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 13:19:56 -0400 Subject: [PATCH 185/443] Format --- llama_cpp/llama.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0db5c10..6836ea5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,9 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[float]] = deque( - maxlen=n_ctx if logits_all else 1 - ) + self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[LlamaCache] = None @@ -547,12 +545,6 @@ class Llama: finish_reason = "stop" break - if self.cache and len(completion_tokens) == 0: - if prompt_tokens not in self.cache: - if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) - self.cache[prompt_tokens] = self.save_state() - completion_tokens.append(token) all_text = self.detokenize(completion_tokens) @@ -611,6 +603,11 @@ class Llama: finish_reason = "length" break + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if stream: yield { "id": completion_id, From 8dfde63255651f05e015df6dcfb614b2eac7c1f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:30:14 -0400 Subject: [PATCH 186/443] Fix return type --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py 
b/llama_cpp/llama_cpp.py index bce0fd7..e60558c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t # Returns the number of bytes copied def llama_copy_state_data( ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_copy_state_data(ctx, dest) @@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Returns the number of bytes read def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_set_state_data(ctx, src) From 0e94a70de1727c8071d5802c34ad83a1fee987b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:31:26 -0400 Subject: [PATCH 187/443] Add in-memory longest prefix cache. Closes #158 --- llama_cpp/llama.py | 91 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6836ea5..de06da0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -5,7 +5,7 @@ import time import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple -from collections import deque +from collections import deque, OrderedDict from . import llama_cpp from .llama_types import * @@ -14,37 +14,50 @@ from .llama_types import * class LlamaCache: """Cache for a llama.cpp model.""" - def __init__(self): - self.cache_state: Dict[Tuple[llama_cpp.llama_token, ...], "LlamaState"] = dict() + def __init__(self, capacity_bytes: int = (2 << 30)): + self.cache_state: OrderedDict[ + Tuple[llama_cpp.llama_token, ...], "LlamaState" + ] = OrderedDict() + self.capacity_bytes = capacity_bytes - def _sorted_keys(self) -> List[Tuple[llama_cpp.llama_token, ...]]: - return [ - key - for _, key in sorted( - ((len(key), key) for key in self.cache_state.keys()), reverse=True - ) - ] + @property + def cache_size(self): + return sum([state.llama_state_size for state in self.cache_state.values()]) - def _find_key( - self, key: Tuple[llama_cpp.llama_token, ...] + def _find_longest_prefix_key( + self, + key: Tuple[llama_cpp.llama_token, ...], ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: - for k in self._sorted_keys(): - if key[: len(k)] == k: - return k - return None + min_len = 0 + min_key = None + keys = ( + (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": - _key = self._find_key(tuple(key)) + key = tuple(key) + _key = self._find_longest_prefix_key(key) if _key is None: - raise KeyError(f"Key not found: {key}") - return self.cache_state[_key] + raise KeyError(f"Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: - return self._find_key(tuple(key)) is not None + return self._find_longest_prefix_key(tuple(key)) is not None def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): - self.cache_state = dict() # NOTE: Currently limit to one cache entry. 
- self.cache_state[tuple(key)] = value + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes: + self.cache_state.popitem(last=False) class LlamaState: @@ -53,7 +66,7 @@ class LlamaState: eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] - llama_state_size: llama_cpp.c_size_t, + llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits @@ -526,10 +539,22 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - if self.cache and prompt_tokens in self.cache: - if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) - self.load_state(self.cache[prompt_tokens]) + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.eval_tokens, prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self.eval_tokens, prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 @@ -1004,3 +1029,15 @@ class Llama: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) return [math.log(x / sum_exps) for x in exps] + + @staticmethod + def longest_token_prefix( + a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token] + ): + longest_prefix = 0 + for _a, _b in zip(a, b): + if _a == _b: + longest_prefix += 1 + else: + break + return longest_prefix From 14da46f16e46dba2a6964c8d0d7ddbce388182e5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:33:17 -0400 Subject: [PATCH 188/443] Added cache size to settings object. --- llama_cpp/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f46f920..e74d17d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -45,6 +45,10 @@ class Settings(BaseSettings): default=False, description="Use a cache to reduce processing times for evaluated prompts.", ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary."
) @@ -89,7 +93,7 @@ def create_app(settings: Optional[Settings] = None): verbose=settings.verbose, ) if settings.cache: - cache = llama_cpp.LlamaCache() + cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) return app From e72f58614b35ae3f995fd46897f2272d8f23362c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:01:34 -0400 Subject: [PATCH 189/443] Change pointer to lower overhead byref --- llama_cpp/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index de06da0..41e6fd8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -295,47 +295,47 @@ class Llama: ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) else: llama_cpp.llama_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore k=top_k, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore z=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=top_p, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) def sample( From a0b61ea2a7c27660bc1421802c327b379a47a7d7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:17:52 -0400 Subject: [PATCH 190/443] Bugfix for models endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e74d17d..b46914e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -357,7 +357,9 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: +def get_models( + llama: llama_cpp.Llama = Depends(get_llama), +) -> ModelList: return { "object": "list", "data": [ From 75d8619b1a373a3900dbbdaf2fc7f71343ae312e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:19:34 -0400 Subject: [PATCH 191/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6d1e9a..781d21b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.44" +version = "0.1.45" description = "Python 
bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 020d236..e2bc2da 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.44", + version="0.1.45", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 65d9cc050cb630a1d12f3874947b4729d1cbaab7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:30:18 -0400 Subject: [PATCH 192/443] Add openai frequency and presence penalty parameters. Closes #169 --- llama_cpp/llama.py | 38 ++++++++++++++++++++++++++++++++++++-- llama_cpp/server/app.py | 4 ---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 41e6fd8..7b53112 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -261,7 +261,7 @@ class Llama: ] self.eval_logits.extend(logits) - def _sample_top_p_top_k( + def _sample( self, last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] last_n_tokens_size: llama_cpp.c_int, @@ -269,6 +269,8 @@ class Llama: top_p: llama_cpp.c_float, temp: llama_cpp.c_float, repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -298,6 +300,14 @@ class Llama: candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) + llama_cpp.llama_sample_frequency_and_presence_penalties( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + alpha_frequency=frequency_penalty, + alpha_presence=presence_penalty, + ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -344,6 +354,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, ): """Sample a token from the model. 
@@ -360,7 +372,7 @@ class Llama: last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self.eval_tokens) ) + list(self.eval_tokens)[-self.last_n_tokens_size :] - return self._sample_top_p_top_k( + return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data ), @@ -369,6 +381,8 @@ class Llama: top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), repeat_penalty=llama_cpp.c_float(repeat_penalty), + frequency_penalty=llama_cpp.c_float(frequency_penalty), + presence_penalty=llama_cpp.c_float(presence_penalty), ) def generate( @@ -378,6 +392,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, reset: bool = True, ) -> Generator[ llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None @@ -431,6 +447,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temp, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ) tokens_or_none = yield token @@ -505,6 +523,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -563,6 +583,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): if token == llama_cpp.llama_token_eos(): @@ -737,6 +759,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -772,6 +796,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -792,6 +818,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -827,6 +855,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -899,6 +929,8 @@ class Llama: stream: bool = False, stop: Optional[List[str]] = [], max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
@@ -932,6 +964,8 @@ class Llama: stream=stream, max_tokens=max_tokens, repeat_penalty=repeat_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b46914e..c9f2aef 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -214,8 +214,6 @@ def create_completion( exclude={ "model", "n", - "frequency_penalty", - "presence_penalty", "best_of", "logit_bias", "user", @@ -315,8 +313,6 @@ def create_chat_completion( exclude={ "model", "n", - "presence_penalty", - "frequency_penalty", "logit_bias", "user", } From 0d751a69a78c0a2f7b83c894d6a98ceec8daa680 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:50:43 -0400 Subject: [PATCH 193/443] Set repeat_penalty to 0 by default --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c9f2aef..b459b80 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,7 +146,7 @@ top_k_field = Field( ) repeat_penalty_field = Field( - default=1.0, + default=0.0, ge=0.0, description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", From 2c0d9b182cd417338a85396660d9828070b3373f Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:03 +0200 Subject: [PATCH 194/443] Fix session loading and saving in low level example chat --- .../low_level_api/low_level_api_chat_cpp.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 272b454..b86d723 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -112,16 +112,17 @@ specified) expect poor results""", file=sys.stderr) if (path.exists(self.params.path_session)): _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_int() + _n_token_count_out = llama_cpp.c_size_t() if (llama_cpp.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, self.params.n_ctx, ctypes.byref(_n_token_count_out) - ) != 0): + ) != 1): print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) return + _n_token_count_out = _n_token_count_out.value self.session_tokens = _session_tokens[:_n_token_count_out] print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) else: @@ -135,19 +136,21 @@ specified) expect poor results""", file=sys.stderr) raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") # debug message about similarity of saved session, if applicable - n_matching_session_tokens = 0 + self.n_matching_session_tokens = 0 if len(self.session_tokens) > 0: for id in self.session_tokens: - if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + if 
self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: break - n_matching_session_tokens += 1 + self.n_matching_session_tokens += 1 - if n_matching_session_tokens >= len(self.embd_inp): + if self.n_matching_session_tokens >= len(self.embd_inp): print(f"session file has exact match for prompt!") - elif n_matching_session_tokens < (len(self.embd_inp) / 2): - print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") else: - print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): @@ -232,9 +235,6 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(util.CONSOLE_COLOR_PROMPT) - self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) - - # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() @@ -302,7 +302,7 @@ n_keep = {self.params.n_keep} ) != 0): raise Exception("Failed to llama_eval!") - if len(self.embd) > 0 and not len(self.params.path_session) > 0: + if len(self.embd) > 0 and len(self.params.path_session) > 0: self.session_tokens.extend(self.embd) self.n_session_consumed = len(self.session_tokens) @@ -319,7 +319,7 @@ n_keep = {self.params.n_keep} llama_cpp.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - self.session_tokens, + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), len(self.session_tokens) ) From eaf9f19aa98fa93fb078f31c6f65ce176629f808 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:42 +0200 Subject: [PATCH 195/443] Fix lora --- examples/low_level_api/low_level_api_chat_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index b86d723..8773cb1 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -76,8 +76,8 @@ specified) expect poor results""", file=sys.stderr) if (len(self.params.lora_adapter) > 0): if (llama_cpp.llama_apply_lora_from_file( self.ctx, - self.params.lora_adapter, - self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, self.params.n_threads ) != 0): print("error: failed to apply lora adapter") From 022e9ebcb82092f3a2df2d2812796b34f35c6e53 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:20:53 -0400 Subject: [PATCH 196/443] Use environment variable if parsed cli arg is None --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 5c9598a..57e0bf1 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -40,7 +40,7 @@ if __name__ == "__main__": ) args = parser.parse_args() - settings = Settings(**vars(args)) + settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) app = create_app(settings=settings) uvicorn.run( From 0d6c60097a1bab3f66f57bb20bfd7b513ffd0ff9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:21:15 -0400 Subject: [PATCH 197/443] Show default value when --help is called --- llama_cpp/server/__main__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 57e0bf1..18011e3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -29,7 +29,9 @@ import uvicorn from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) for name, field in Settings.__fields__.items(): parser.add_argument( f"--{name}", From 6d69461ef55cccffd2b4ad2635081b31e7be6654 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:21:47 -0400 Subject: [PATCH 198/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 781d21b..1eed653 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.45" +version = "0.1.46" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index e2bc2da..504515e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.45", + version="0.1.46", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 29f094bbcf21f24d6bdb1a4ee95d3a501387f08f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:46:25 -0400 Subject: [PATCH 199/443] Bugfix: not falling back to environment variables when a default value is set.
--- llama_cpp/server/__main__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 18011e3..4fe1d94 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -29,16 +29,16 @@ import uvicorn from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) + parser = argparse.ArgumentParser() for name, field in Settings.__fields__.items(): + description = field.field_info.description + if field.default is not None and description is not None: + description += f" (default: {field.default})" parser.add_argument( f"--{name}", dest=name, type=field.type_, - default=field.default, - help=field.field_info.description, + help=description, ) args = parser.parse_args() From a3cc7bf5b2d790d528d851db0dfb624a73953e6c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:46:50 -0400 Subject: [PATCH 200/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1eed653..24375ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.46" +version = "0.1.47" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 504515e..f965c0d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.46", + version="0.1.47", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From b1489befda06ef15d224dc09fe8121e8c33ed1fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 May 2023 21:04:42 +0000 Subject: [PATCH 201/443] Bump mkdocs-material from 9.1.9 to 9.1.11 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.9 to 9.1.11. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.9...9.1.11) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5b364a7..5474bf4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.9" +version = "9.1.11" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, - {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, + {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, + {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" +content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" diff --git a/pyproject.toml b/pyproject.toml index 24375ef..5e12338 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.9" +mkdocs-material = "^9.1.11" pytest = "^7.3.1" httpx = "^0.24.0" From 82d138fe547b6013743f8b712d37097d5433176f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 18:49:11 -0400 Subject: [PATCH 202/443] Fix: default repeat_penalty --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b459b80..621b73e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,7 +146,7 @@ top_k_field = Field( ) repeat_penalty_field = Field( - default=0.0, + default=1.1, ge=0.0, description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. 
A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", From c37883b477a8032c2b434ad6ce873694038c1b69 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 18:49:37 -0400 Subject: [PATCH 203/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 24375ef..775fd49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.47" +version = "0.1.48" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f965c0d..f4cbb60 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.47", + version="0.1.48", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From f315b82832b8a47e3c9e56d1c450858333339d36 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 19:53:21 -0400 Subject: [PATCH 204/443] Revert changes to llama.cpp and setup.py --- setup.py | 3 +-- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0b90312..f4cbb60 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.43", + version="0.1.48", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", @@ -22,7 +22,6 @@ setup( extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], }, - cmake_args=['-DLLAMA_CUBLAS=ON'], python_requires=">=3.7", classifiers=[ "Programming Language :: Python :: 3", diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 173d0e6..1b0fd45 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 173d0e6419e8f8f3c1f4f13201b777f4c60629f3 +Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 From d957422bf4e5bd400ae4e463b5c83d84a741d3cb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 21:21:25 -0400 Subject: [PATCH 205/443] Implement sampling as in llama.cpp main example --- llama_cpp/llama.py | 162 +++++++++++++++++++++------------------------ 1 file changed, 76 insertions(+), 86 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4b4fb01..7be51e1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -268,14 +268,13 @@ class Llama: top_k: llama_cpp.c_int, top_p: llama_cpp.c_float, temp: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, - mirostat_mu: llama_cpp.c_float, - mirostat_m: llama_cpp.c_int, + tfs_z: llama_cpp.c_float, repeat_penalty: llama_cpp.c_float, frequency_penalty: llama_cpp.c_float, presence_penalty: llama_cpp.c_float, + mirostat_mode: llama_cpp.c_int, + mirostat_tau: llama_cpp.c_float, + mirostat_eta: llama_cpp.c_float, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -305,33 +304,6 @@ class Llama: candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) - if mirostat_mode.value == 1: - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - 
llama_cpp.llama_sample_token_mirostat( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - tau=mirostat_tau, - eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore - m=mirostat_m - ) - elif mirostat_mode.value == 2: - llama_cpp.llama_sample_temperature( - ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), - temp=temp, - ) - llama_cpp.llama_sample_token_mirostat_v2( - ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - tau=mirostat_tau, - eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu) # type: ignore - ) llama_cpp.llama_sample_frequency_and_presence_penalties( ctx=self.ctx, candidates=llama_cpp.ctypes.byref(candidates), # type: ignore @@ -340,11 +312,41 @@ class Llama: alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) - if float(temp.value) == 0.0: + if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) + elif mirostat_mode.value == 1: + mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = llama_cpp.c_int(100) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return llama_cpp.llama_sample_token_mirostat( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + m=mirostat_m, + ) + elif mirostat_mode.value == 2: + mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.pointer(candidates), + temp=temp, + ) + return llama_cpp.llama_sample_token_mirostat_v2( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + ) else: llama_cpp.llama_sample_top_k( ctx=self.ctx, @@ -355,7 +357,7 @@ class Llama: llama_cpp.llama_sample_tail_free( ctx=self.ctx, candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - z=llama_cpp.c_float(1.0), + z=tfs_z, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( @@ -382,17 +384,16 @@ class Llama: def sample( self, - top_k: int, - top_p: float, - temp: float, - mirostat_mode: int, - mirostat_tau: float, - mirostat_eta: float, - mirostat_mu: float, - mirostat_m: int, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, ): """Sample a token from the model. 
@@ -417,14 +418,13 @@ class Llama: top_k=llama_cpp.c_int(top_k), top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), - mirostat_mode=llama_cpp.c_int(mirostat_mode), - mirostat_mu=llama_cpp.c_float(mirostat_mu), - mirostat_tau=llama_cpp.c_float(mirostat_tau), - mirostat_eta=llama_cpp.c_float(mirostat_eta), - mirostat_m=llama_cpp.c_int(mirostat_m), + tfs_z=llama_cpp.c_float(tfs_z), repeat_penalty=llama_cpp.c_float(repeat_penalty), frequency_penalty=llama_cpp.c_float(frequency_penalty), presence_penalty=llama_cpp.c_float(presence_penalty), + mirostat_mode=llama_cpp.c_int(mirostat_mode), + mirostat_tau=llama_cpp.c_float(mirostat_tau), + mirostat_eta=llama_cpp.c_float(mirostat_eta), ) def generate( @@ -433,15 +433,13 @@ class Llama: top_k: int, top_p: float, temp: float, - mirostat_mode: int, - mirostat_tau: float, - mirostat_eta: float, - mirostat_mu: float, - mirostat_m: int, repeat_penalty: float, + reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, - reset: bool = True, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, ) -> Generator[ llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None ]: @@ -494,14 +492,12 @@ class Llama: top_k=top_k, top_p=top_p, temp=temp, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, - mirostat_mu=mirostat_mu, - mirostat_m=mirostat_m, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, ) tokens_or_none = yield token tokens = [token] @@ -571,11 +567,6 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 16, temperature: float = 0.8, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - mirostat_mu: float = 10, - mirostat_m: int = 100, top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -585,6 +576,9 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -643,8 +637,6 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, - mirostat_mu=mirostat_mu, - mirostat_m=mirostat_m, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, @@ -817,11 +809,6 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - mirostat_mu: float = 10, - mirostat_m: int = 100, top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -831,6 +818,9 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -859,11 +849,6 @@ class Llama: suffix=suffix, max_tokens=max_tokens, temperature=temperature, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - mirostat_mu=mirostat_mu, - mirostat_m=mirostat_m, top_p=top_p, logprobs=logprobs, echo=echo, @@ -873,6 +858,9 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -886,11 +874,6 @@ class Llama: suffix: Optional[str] = None, max_tokens: int = 128, temperature: float = 0.8, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - mirostat_mu: float = 10, - mirostat_m: int = 100, top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, @@ -900,6 +883,9 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -928,11 +914,6 @@ class Llama: suffix=suffix, max_tokens=max_tokens, temperature=temperature, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - mirostat_mu=mirostat_mu, - mirostat_m=mirostat_m, top_p=top_p, logprobs=logprobs, echo=echo, @@ -942,6 +923,9 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, ) def _convert_text_completion_to_chat( @@ -1014,6 +998,9 @@ class Llama: presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
@@ -1048,6 +1035,9 @@ class Llama: repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore From 17dc51a7d272705bb6ec6b33910ecc091403566e Mon Sep 17 00:00:00 2001 From: Joel Kurian Date: Tue, 9 May 2023 21:34:46 +0530 Subject: [PATCH 206/443] Updated installation instructions for BLAS backends --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b7772d9..9f494f9 100644 --- a/README.md +++ b/README.md @@ -35,19 +35,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: ```bash -LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: ```bash -LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: ```bash -LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` From 99c016c9b367641d3b04d91ca77590b53d288b28 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sun, 30 Apr 2023 23:21:16 -0700 Subject: [PATCH 207/443] pyproject.toml: add skbuild to dev dependencies The README says to run `python3 setup.py develop` to build the library, however on a blank install this fails because scikit-build is not installed. This adds it to the dev dependencies so that it is installed. --- poetry.lock | 372 ++++++++++++++++++++++++++++++------------------- pyproject.toml | 1 + 2 files changed, 231 insertions(+), 142 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5474bf4..861de10 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "anyio" @@ -92,14 +92,14 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, ] [[package]] @@ -293,35 +293,31 @@ files = [ [[package]] name = "cryptography" -version = "39.0.2" +version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-39.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:2725672bb53bb92dc7b4150d233cd4b8c59615cd8288d495eaa86db00d4e5c06"}, - {file = "cryptography-39.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:23df8ca3f24699167daf3e23e51f7ba7334d504af63a94af468f468b975b7dd7"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:eb40fe69cfc6f5cdab9a5ebd022131ba21453cf7b8a7fd3631f45bbf52bed612"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc0521cce2c1d541634b19f3ac661d7a64f9555135e9d8af3980965be717fd4a"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffd394c7896ed7821a6d13b24657c6a34b6e2650bd84ae063cf11ccffa4f1a97"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:e8a0772016feeb106efd28d4a328e77dc2edae84dfbac06061319fdb669ff828"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8f35c17bd4faed2bc7797d2a66cbb4f986242ce2e30340ab832e5d99ae60e011"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b49a88ff802e1993b7f749b1eeb31134f03c8d5c956e3c125c75558955cda536"}, - {file = "cryptography-39.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5f8c682e736513db7d04349b4f6693690170f95aac449c56f97415c6980edef5"}, - {file = "cryptography-39.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d7d84a512a59f4412ca8549b01f94be4161c94efc598bf09d027d67826beddc0"}, - {file = "cryptography-39.0.2-cp36-abi3-win32.whl", hash = "sha256:c43ac224aabcbf83a947eeb8b17eaf1547bce3767ee2d70093b461f31729a480"}, - {file = "cryptography-39.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:788b3921d763ee35dfdb04248d0e3de11e3ca8eb22e2e48fef880c42e1f3c8f9"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d15809e0dbdad486f4ad0979753518f47980020b7a34e9fc56e8be4f60702fac"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:50cadb9b2f961757e712a9737ef33d89b8190c3ea34d0fb6675e00edbe35d074"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:103e8f7155f3ce2ffa0049fe60169878d47a4364b277906386f8de21c9234aa1"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6236a9610c912b129610eb1a274bdc1350b5df834d124fa84729ebeaf7da42c3"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e944fe07b6f229f4c1a06a7ef906a19652bdd9fd54c761b0ff87e83ae7a30354"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:35d658536b0a4117c885728d1a7032bdc9a5974722ae298d6c533755a6ee3915"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:30b1d1bfd00f6fc80d11300a29f1d8ab2b8d9febb6ed4a38a76880ec564fae84"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e029b844c21116564b8b61216befabca4b500e6816fa9f0ba49527653cae2108"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fa507318e427169ade4e9eccef39e9011cdc19534f55ca2f36ec3f388c1f70f3"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:8bc0008ef798231fac03fe7d26e82d601d15bd16f3afaad1c6113771566570f3"}, - {file = "cryptography-39.0.2.tar.gz", hash = "sha256:bc5b871e977c8ee5a1bbc42fa8d19bcc08baf0c51cbf1586b0e87a2694dde42f"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, + {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, + {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, + {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, ] [package.dependencies] @@ -330,23 +326,35 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] +pep8test = ["black", "check-manifest", "mypy", "ruff"] sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = 
["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist"] test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "distro" +version = "1.8.0" +description = "Distro - an OS platform information API" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.8.0-py3-none-any.whl", hash = "sha256:99522ca3e365cac527b44bde033f64c6945d90eb9f769703caaec52b09bbd3ff"}, + {file = "distro-1.8.0.tar.gz", hash = "sha256:02e111d1dc6a50abb8eed6bf31c3e48ed8b0830d1ea2a1b78c61765c2513fdd8"}, +] + [[package]] name = "docutils" -version = "0.19" +version = "0.20" description = "Docutils -- Python Documentation Utilities" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"}, - {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, + {file = "docutils-0.20-py3-none-any.whl", hash = "sha256:a428f10de4de4774389734c986a01b4af2d802d26717108b0f1b9356862937c5"}, + {file = "docutils-0.20.tar.gz", hash = "sha256:f75a5a52fbcacd81b47e42888ad2b380748aaccfb3f13af0fe69deb759f01eb6"}, ] [[package]] @@ -384,22 +392,19 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "griffe" -version = "0.25.5" +version = "0.27.3" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "griffe-0.25.5-py3-none-any.whl", hash = "sha256:1fb9edff48e66d4873014a2ebf21aca5f271d0006a4c937826e3cf592ffb3706"}, - {file = "griffe-0.25.5.tar.gz", hash = "sha256:11ea3403ef0560a1cbcf7f302eb5d21cf4c1d8ed3f8a16a75aa9f6f458caf3f1"}, + {file = "griffe-0.27.3-py3-none-any.whl", hash = "sha256:094513b209d4acd4b2680c2415d3af5f8ed925714795380c2a7d070e222e0b27"}, + {file = "griffe-0.27.3.tar.gz", hash = "sha256:a3d0f75aa76b80f181f818cf605f658a69fccf287aaeeeafc7a6cf4e6a2ca27e"}, ] [package.dependencies] colorama = ">=0.4" -[package.extras] -async = ["aiofiles (>=0.7,<1.0)"] - [[package]] name = "h11" version = "0.14.0" @@ -472,14 +477,14 @@ files = [ [[package]] name = "importlib-metadata" -version = "6.1.0" +version = "6.6.0" description = "Read metadata from Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "importlib_metadata-6.1.0-py3-none-any.whl", hash = "sha256:ff80f3b5394912eb1b108fcfd444dc78b7f1f3e16b16188054bd01cb9cb86f09"}, - {file = "importlib_metadata-6.1.0.tar.gz", hash = "sha256:43ce9281e097583d758c2c708c4376371261a02c34682491a8e98352365aad20"}, + {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, + {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, ] [package.dependencies] @@ -835,19 +840,19 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "0.8.3" +version = "0.10.1" description = "A Python handler for mkdocstrings." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-python-0.8.3.tar.gz", hash = "sha256:9ae473f6dc599339b09eee17e4d2b05d6ac0ec29860f3fc9b7512d940fc61adf"}, - {file = "mkdocstrings_python-0.8.3-py3-none-any.whl", hash = "sha256:4e6e1cd6f37a785de0946ced6eb846eb2f5d891ac1cc2c7b832943d3529087a7"}, + {file = "mkdocstrings_python-0.10.1-py3-none-any.whl", hash = "sha256:ef239cee2c688e2b949a0a47e42a141d744dd12b7007311b3309dc70e3bafc5c"}, + {file = "mkdocstrings_python-0.10.1.tar.gz", hash = "sha256:b72301fff739070ec517b5b36bf2f7c49d1360a275896a64efb97fc17d3f3968"}, ] [package.dependencies] griffe = ">=0.24" -mkdocstrings = ">=0.19" +mkdocstrings = ">=0.20" [[package]] name = "more-itertools" @@ -875,14 +880,14 @@ files = [ [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] [[package]] @@ -914,19 +919,19 @@ testing = ["pytest", "pytest-cov"] [[package]] name = "platformdirs" -version = "3.1.1" +version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.1.1-py3-none-any.whl", hash = "sha256:e5986afb596e4bb5bde29a79ac9061aa955b94fca2399b7aaac4090860920dd8"}, - {file = "platformdirs-3.1.1.tar.gz", hash = "sha256:024996549ee88ec1a9aa99ff7f8fc819bb59e2c3477b410d90a16d32d6e707aa"}, + {file = "platformdirs-3.5.0-py3-none-any.whl", hash = "sha256:47692bc24c1958e8b0f13dd727307cff1db103fca36399f457da8e05f222fdc4"}, + {file = "platformdirs-3.5.0.tar.gz", hash = "sha256:7954a68d0ba23558d753f73437c55f89027cf8f5108c19844d4b82e5af396335"}, ] [package.extras] -docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" @@ -958,14 +963,14 @@ files = [ [[package]] name = "pygments" -version = "2.14.0" +version = "2.15.1" description = "Pygments is a syntax highlighting package written in Python." 
category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "Pygments-2.14.0-py3-none-any.whl", hash = "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"}, - {file = "Pygments-2.14.0.tar.gz", hash = "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297"}, + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, ] [package.extras] @@ -973,14 +978,14 @@ plugins = ["importlib-metadata"] [[package]] name = "pymdown-extensions" -version = "9.10" +version = "9.11" description = "Extension pack for Python Markdown." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pymdown_extensions-9.10-py3-none-any.whl", hash = "sha256:31eaa76ce6f96aabfcea98787c2fff2c5c0611b20a53a94213970cfbf05f02b8"}, - {file = "pymdown_extensions-9.10.tar.gz", hash = "sha256:562c38eee4ce3f101ce631b804bfc2177a8a76c7e4dc908871fb6741a90257a7"}, + {file = "pymdown_extensions-9.11-py3-none-any.whl", hash = "sha256:a499191d8d869f30339de86fcf072a787e86c42b6f16f280f5c2cf174182b7f3"}, + {file = "pymdown_extensions-9.11.tar.gz", hash = "sha256:f7e86c1d3981f23d9dc43294488ecb54abadd05b0be4bf8f0e15efc90f7853ff"}, ] [package.dependencies] @@ -1124,91 +1129,119 @@ md = ["cmarkgfm (>=0.8.0)"] [[package]] name = "regex" -version = "2023.3.23" +version = "2023.5.5" description = "Alternative regular expression module, to replace re." category = "dev" optional = false -python-versions = ">=3.8" +python-versions = ">=3.6" files = [ - {file = "regex-2023.3.23-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:845a5e2d84389c4ddada1a9b95c055320070f18bb76512608374aca00d22eca8"}, - {file = "regex-2023.3.23-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:87d9951f5a538dd1d016bdc0dcae59241d15fa94860964833a54d18197fcd134"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37ae17d3be44c0b3f782c28ae9edd8b47c1f1776d4cabe87edc0b98e1f12b021"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0b8eb1e3bca6b48dc721818a60ae83b8264d4089a4a41d62be6d05316ec38e15"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df45fac182ebc3c494460c644e853515cc24f5ad9da05f8ffb91da891bfee879"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7006105b10b59971d3b248ad75acc3651c7e4cf54d81694df5a5130a3c3f7ea"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93f3f1aa608380fe294aa4cb82e2afda07a7598e828d0341e124b8fd9327c715"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:787954f541ab95d8195d97b0b8cf1dc304424adb1e07365967e656b92b38a699"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:20abe0bdf03630fe92ccafc45a599bca8b3501f48d1de4f7d121153350a2f77d"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:11d00c31aeab9a6e0503bc77e73ed9f4527b3984279d997eb145d7c7be6268fd"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:d5bbe0e1511b844794a3be43d6c145001626ba9a6c1db8f84bdc724e91131d9d"}, - 
{file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ea3c0cb56eadbf4ab2277e7a095676370b3e46dbfc74d5c383bd87b0d6317910"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d895b4c863059a4934d3e874b90998df774644a41b349ebb330f85f11b4ef2c0"}, - {file = "regex-2023.3.23-cp310-cp310-win32.whl", hash = "sha256:9d764514d19b4edcc75fd8cb1423448ef393e8b6cbd94f38cab983ab1b75855d"}, - {file = "regex-2023.3.23-cp310-cp310-win_amd64.whl", hash = "sha256:11d1f2b7a0696dc0310de0efb51b1f4d813ad4401fe368e83c0c62f344429f98"}, - {file = "regex-2023.3.23-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8a9c63cde0eaa345795c0fdeb19dc62d22e378c50b0bc67bf4667cd5b482d98b"}, - {file = "regex-2023.3.23-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dd7200b4c27b68cf9c9646da01647141c6db09f48cc5b51bc588deaf8e98a797"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22720024b90a6ba673a725dcc62e10fb1111b889305d7c6b887ac7466b74bedb"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b190a339090e6af25f4a5fd9e77591f6d911cc7b96ecbb2114890b061be0ac1"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e76b6fc0d8e9efa39100369a9b3379ce35e20f6c75365653cf58d282ad290f6f"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7868b8f218bf69a2a15402fde08b08712213a1f4b85a156d90473a6fb6b12b09"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2472428efc4127374f494e570e36b30bb5e6b37d9a754f7667f7073e43b0abdd"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c37df2a060cb476d94c047b18572ee2b37c31f831df126c0da3cd9227b39253d"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4479f9e2abc03362df4045b1332d4a2b7885b245a30d4f4b051c4083b97d95d8"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e2396e0678167f2d0c197da942b0b3fb48fee2f0b5915a0feb84d11b6686afe6"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75f288c60232a5339e0ff2fa05779a5e9c74e9fc085c81e931d4a264501e745b"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c869260aa62cee21c5eb171a466c0572b5e809213612ef8d495268cd2e34f20d"}, - {file = "regex-2023.3.23-cp311-cp311-win32.whl", hash = "sha256:25f0532fd0c53e96bad84664171969de9673b4131f2297f1db850d3918d58858"}, - {file = "regex-2023.3.23-cp311-cp311-win_amd64.whl", hash = "sha256:5ccfafd98473e007cebf7da10c1411035b7844f0f204015efd050601906dbb53"}, - {file = "regex-2023.3.23-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6572ff287176c0fb96568adb292674b421fa762153ed074d94b1d939ed92c253"}, - {file = "regex-2023.3.23-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a610e0adfcb0fc84ea25f6ea685e39e74cbcd9245a72a9a7aab85ff755a5ed27"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086afe222d58b88b62847bdbd92079b4699350b4acab892f88a935db5707c790"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79e29fd62fa2f597a6754b247356bda14b866131a22444d67f907d6d341e10f3"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:c07ce8e9eee878a48ebeb32ee661b49504b85e164b05bebf25420705709fdd31"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86b036f401895e854de9fefe061518e78d506d8a919cc250dc3416bca03f6f9a"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78ac8dd8e18800bb1f97aad0d73f68916592dddf233b99d2b5cabc562088503a"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:539dd010dc35af935b32f248099e38447bbffc10b59c2b542bceead2bed5c325"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9bf4a5626f2a0ea006bf81e8963f498a57a47d58907eaa58f4b3e13be68759d8"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf86b4328c204c3f315074a61bc1c06f8a75a8e102359f18ce99fbcbbf1951f0"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:2848bf76673c83314068241c8d5b7fa9ad9bed866c979875a0e84039349e8fa7"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c125a02d22c555e68f7433bac8449992fa1cead525399f14e47c2d98f2f0e467"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cd1671e9d5ac05ce6aa86874dd8dfa048824d1dbe73060851b310c6c1a201a96"}, - {file = "regex-2023.3.23-cp38-cp38-win32.whl", hash = "sha256:fffe57312a358be6ec6baeb43d253c36e5790e436b7bf5b7a38df360363e88e9"}, - {file = "regex-2023.3.23-cp38-cp38-win_amd64.whl", hash = "sha256:dbb3f87e15d3dd76996d604af8678316ad2d7d20faa394e92d9394dfd621fd0c"}, - {file = "regex-2023.3.23-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c88e8c226473b5549fe9616980ea7ca09289246cfbdf469241edf4741a620004"}, - {file = "regex-2023.3.23-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6560776ec19c83f3645bbc5db64a7a5816c9d8fb7ed7201c5bcd269323d88072"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b1fc2632c01f42e06173d8dd9bb2e74ab9b0afa1d698058c867288d2c7a31f3"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fdf7ad455f1916b8ea5cdbc482d379f6daf93f3867b4232d14699867a5a13af7"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5fc33b27b1d800fc5b78d7f7d0f287e35079ecabe68e83d46930cf45690e1c8c"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c49552dc938e3588f63f8a78c86f3c9c75301e813bca0bef13bdb4b87ccf364"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e152461e9a0aedec7d37fc66ec0fa635eca984777d3d3c3e36f53bf3d3ceb16e"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:db034255e72d2995cf581b14bb3fc9c00bdbe6822b49fcd4eef79e1d5f232618"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:55ae114da21b7a790b90255ea52d2aa3a0d121a646deb2d3c6a3194e722fc762"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ef3f528fe1cc3d139508fe1b22523745aa77b9d6cb5b0bf277f48788ee0b993f"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:a81c9ec59ca2303acd1ccd7b9ac409f1e478e40e96f8f79b943be476c5fdb8bb"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_s390x.whl", hash = 
"sha256:cde09c4fdd070772aa2596d97e942eb775a478b32459e042e1be71b739d08b77"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3cd9f5dd7b821f141d3a6ca0d5d9359b9221e4f051ca3139320adea9f1679691"}, - {file = "regex-2023.3.23-cp39-cp39-win32.whl", hash = "sha256:7304863f3a652dab5e68e6fb1725d05ebab36ec0390676d1736e0571ebb713ef"}, - {file = "regex-2023.3.23-cp39-cp39-win_amd64.whl", hash = "sha256:54c3fa855a3f7438149de3211738dd9b5f0c733f48b54ae05aa7fce83d48d858"}, - {file = "regex-2023.3.23.tar.gz", hash = "sha256:dc80df325b43ffea5cdea2e3eaa97a44f3dd298262b1c7fe9dbb2a9522b956a7"}, + {file = "regex-2023.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:48c9ec56579d4ba1c88f42302194b8ae2350265cb60c64b7b9a88dcb7fbde309"}, + {file = "regex-2023.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f4541550459c08fdd6f97aa4e24c6f1932eec780d58a2faa2068253df7d6ff"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e22e4460f0245b468ee645156a4f84d0fc35a12d9ba79bd7d79bdcd2f9629d"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b870b6f632fc74941cadc2a0f3064ed8409e6f8ee226cdfd2a85ae50473aa94"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:171c52e320fe29260da550d81c6b99f6f8402450dc7777ef5ced2e848f3b6f8f"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad5524c2aedaf9aa14ef1bc9327f8abd915699dea457d339bebbe2f0d218f86"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a0f874ee8c0bc820e649c900243c6d1e6dc435b81da1492046716f14f1a2a96"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e645c757183ee0e13f0bbe56508598e2d9cd42b8abc6c0599d53b0d0b8dd1479"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a4c5da39bca4f7979eefcbb36efea04471cd68db2d38fcbb4ee2c6d440699833"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5e3f4468b8c6fd2fd33c218bbd0a1559e6a6fcf185af8bb0cc43f3b5bfb7d636"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:59e4b729eae1a0919f9e4c0fc635fbcc9db59c74ad98d684f4877be3d2607dd6"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ba73a14e9c8f9ac409863543cde3290dba39098fc261f717dc337ea72d3ebad2"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0bbd5dcb19603ab8d2781fac60114fb89aee8494f4505ae7ad141a3314abb1f9"}, + {file = "regex-2023.5.5-cp310-cp310-win32.whl", hash = "sha256:40005cbd383438aecf715a7b47fe1e3dcbc889a36461ed416bdec07e0ef1db66"}, + {file = "regex-2023.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:59597cd6315d3439ed4b074febe84a439c33928dd34396941b4d377692eca810"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f08276466fedb9e36e5193a96cb944928301152879ec20c2d723d1031cd4ddd"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cd46f30e758629c3ee91713529cfbe107ac50d27110fdcc326a42ce2acf4dafc"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2910502f718828cecc8beff004917dcf577fc5f8f5dd40ffb1ea7612124547b"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", 
hash = "sha256:445d6f4fc3bd9fc2bf0416164454f90acab8858cd5a041403d7a11e3356980e8"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18196c16a584619c7c1d843497c069955d7629ad4a3fdee240eb347f4a2c9dbe"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d430a23b661629661f1fe8395be2004006bc792bb9fc7c53911d661b69dd7e"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72a28979cc667e5f82ef433db009184e7ac277844eea0f7f4d254b789517941d"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f764e4dfafa288e2eba21231f455d209f4709436baeebb05bdecfb5d8ddc3d35"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23d86ad2121b3c4fc78c58f95e19173790e22ac05996df69b84e12da5816cb17"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:690a17db524ee6ac4a27efc5406530dd90e7a7a69d8360235323d0e5dafb8f5b"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1ecf3dcff71f0c0fe3e555201cbe749fa66aae8d18f80d2cc4de8e66df37390a"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:811040d7f3dd9c55eb0d8b00b5dcb7fd9ae1761c454f444fd9f37fe5ec57143a"}, + {file = "regex-2023.5.5-cp311-cp311-win32.whl", hash = "sha256:c8c143a65ce3ca42e54d8e6fcaf465b6b672ed1c6c90022794a802fb93105d22"}, + {file = "regex-2023.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:586a011f77f8a2da4b888774174cd266e69e917a67ba072c7fc0e91878178a80"}, + {file = "regex-2023.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b6365703e8cf1644b82104cdd05270d1a9f043119a168d66c55684b1b557d008"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a56c18f21ac98209da9c54ae3ebb3b6f6e772038681d6cb43b8d53da3b09ee81"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8b942d8b3ce765dbc3b1dad0a944712a89b5de290ce8f72681e22b3c55f3cc8"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:844671c9c1150fcdac46d43198364034b961bd520f2c4fdaabfc7c7d7138a2dd"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2ce65bdeaf0a386bb3b533a28de3994e8e13b464ac15e1e67e4603dd88787fa"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fee0016cc35a8a91e8cc9312ab26a6fe638d484131a7afa79e1ce6165328a135"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:18f05d14f14a812fe9723f13afafefe6b74ca042d99f8884e62dbd34dcccf3e2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:941b3f1b2392f0bcd6abf1bc7a322787d6db4e7457be6d1ffd3a693426a755f2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:921473a93bcea4d00295799ab929522fc650e85c6b9f27ae1e6bb32a790ea7d3"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:e2205a81f815b5bb17e46e74cc946c575b484e5f0acfcb805fb252d67e22938d"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:385992d5ecf1a93cb85adff2f73e0402dd9ac29b71b7006d342cc920816e6f32"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = 
"sha256:890a09cb0a62198bff92eda98b2b507305dd3abf974778bae3287f98b48907d3"}, + {file = "regex-2023.5.5-cp36-cp36m-win32.whl", hash = "sha256:821a88b878b6589c5068f4cc2cfeb2c64e343a196bc9d7ac68ea8c2a776acd46"}, + {file = "regex-2023.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:7918a1b83dd70dc04ab5ed24c78ae833ae8ea228cef84e08597c408286edc926"}, + {file = "regex-2023.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:338994d3d4ca4cf12f09822e025731a5bdd3a37aaa571fa52659e85ca793fb67"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a69cf0c00c4d4a929c6c7717fd918414cab0d6132a49a6d8fc3ded1988ed2ea"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f5e06df94fff8c4c85f98c6487f6636848e1dc85ce17ab7d1931df4a081f657"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8906669b03c63266b6a7693d1f487b02647beb12adea20f8840c1a087e2dfb5"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fda3e50abad8d0f48df621cf75adc73c63f7243cbe0e3b2171392b445401550"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ac2b7d341dc1bd102be849d6dd33b09701223a851105b2754339e390be0627a"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fb2b495dd94b02de8215625948132cc2ea360ae84fe6634cd19b6567709c8ae2"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aa7d032c1d84726aa9edeb6accf079b4caa87151ca9fabacef31fa028186c66d"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3d45864693351c15531f7e76f545ec35000d50848daa833cead96edae1665559"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21e90a288e6ba4bf44c25c6a946cb9b0f00b73044d74308b5e0afd190338297c"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:10250a093741ec7bf74bcd2039e697f519b028518f605ff2aa7ac1e9c9f97423"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6b8d0c153f07a953636b9cdb3011b733cadd4178123ef728ccc4d5969e67f3c2"}, + {file = "regex-2023.5.5-cp37-cp37m-win32.whl", hash = "sha256:10374c84ee58c44575b667310d5bbfa89fb2e64e52349720a0182c0017512f6c"}, + {file = "regex-2023.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9b320677521aabf666cdd6e99baee4fb5ac3996349c3b7f8e7c4eee1c00dfe3a"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:afb1c70ec1e594a547f38ad6bf5e3d60304ce7539e677c1429eebab115bce56e"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cf123225945aa58b3057d0fba67e8061c62d14cc8a4202630f8057df70189051"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99757ad7fe5c8a2bb44829fc57ced11253e10f462233c1255fe03888e06bc19"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a623564d810e7a953ff1357f7799c14bc9beeab699aacc8b7ab7822da1e952b8"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ced02e3bd55e16e89c08bbc8128cff0884d96e7f7a5633d3dc366b6d95fcd1d6"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cbe6b5be3b9b698d8cc4ee4dee7e017ad655e83361cd0ea8e653d65e469468"}, + {file = 
"regex-2023.5.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a6e4b0e0531223f53bad07ddf733af490ba2b8367f62342b92b39b29f72735a"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e9c4f778514a560a9c9aa8e5538bee759b55f6c1dcd35613ad72523fd9175b8"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:256f7f4c6ba145f62f7a441a003c94b8b1af78cee2cccacfc1e835f93bc09426"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:bd7b68fd2e79d59d86dcbc1ccd6e2ca09c505343445daaa4e07f43c8a9cc34da"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4a5059bd585e9e9504ef9c07e4bc15b0a621ba20504388875d66b8b30a5c4d18"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:6893544e06bae009916a5658ce7207e26ed17385149f35a3125f5259951f1bbe"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c64d5abe91a3dfe5ff250c6bb267ef00dbc01501518225b45a5f9def458f31fb"}, + {file = "regex-2023.5.5-cp38-cp38-win32.whl", hash = "sha256:7923470d6056a9590247ff729c05e8e0f06bbd4efa6569c916943cb2d9b68b91"}, + {file = "regex-2023.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:4035d6945cb961c90c3e1c1ca2feb526175bcfed44dfb1cc77db4fdced060d3e"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50fd2d9b36938d4dcecbd684777dd12a407add4f9f934f235c66372e630772b0"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d19e57f888b00cd04fc38f5e18d0efbd91ccba2d45039453ab2236e6eec48d4d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd966475e963122ee0a7118ec9024388c602d12ac72860f6eea119a3928be053"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db09e6c18977a33fea26fe67b7a842f706c67cf8bda1450974d0ae0dd63570df"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6164d4e2a82f9ebd7752a06bd6c504791bedc6418c0196cd0a23afb7f3e12b2d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84397d3f750d153ebd7f958efaa92b45fea170200e2df5e0e1fd4d85b7e3f58a"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c3efee9bb53cbe7b285760c81f28ac80dc15fa48b5fe7e58b52752e642553f1"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:144b5b017646b5a9392a5554a1e5db0000ae637be4971c9747566775fc96e1b2"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1189fbbb21e2c117fda5303653b61905aeeeea23de4a94d400b0487eb16d2d60"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f83fe9e10f9d0b6cf580564d4d23845b9d692e4c91bd8be57733958e4c602956"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:72aa4746993a28c841e05889f3f1b1e5d14df8d3daa157d6001a34c98102b393"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:de2f780c3242ea114dd01f84848655356af4dd561501896c751d7b885ea6d3a1"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:290fd35219486dfbc00b0de72f455ecdd63e59b528991a6aec9fdfc0ce85672e"}, + {file = "regex-2023.5.5-cp39-cp39-win32.whl", hash = 
"sha256:732176f5427e72fa2325b05c58ad0b45af341c459910d766f814b0584ac1f9ac"}, + {file = "regex-2023.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:1307aa4daa1cbb23823d8238e1f61292fd07e4e5d8d38a6efff00b67a7cdb764"}, + {file = "regex-2023.5.5.tar.gz", hash = "sha256:7d76a8a1fc9da08296462a18f16620ba73bcbf5909e42383b253ef34d9d5141e"}, ] [[package]] name = "requests" -version = "2.28.2" +version = "2.30.0" description = "Python HTTP for Humans." category = "dev" optional = false -python-versions = ">=3.7, <4" +python-versions = ">=3.7" files = [ - {file = "requests-2.28.2-py3-none-any.whl", hash = "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa"}, - {file = "requests-2.28.2.tar.gz", hash = "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"}, + {file = "requests-2.30.0-py3-none-any.whl", hash = "sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294"}, + {file = "requests-2.30.0.tar.gz", hash = "sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4"}, ] [package.dependencies] certifi = ">=2017.4.17" charset-normalizer = ">=2,<4" idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<3" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] @@ -1216,14 +1249,14 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "requests-toolbelt" -version = "0.10.1" +version = "1.0.0" description = "A utility belt for advanced users of python-requests" category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d"}, - {file = "requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7"}, + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] [package.dependencies] @@ -1246,14 +1279,14 @@ idna2008 = ["idna"] [[package]] name = "rich" -version = "13.3.2" +version = "13.3.5" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" category = "dev" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.3.2-py3-none-any.whl", hash = "sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f"}, - {file = "rich-13.3.2.tar.gz", hash = "sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001"}, + {file = "rich-13.3.5-py3-none-any.whl", hash = "sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704"}, + {file = "rich-13.3.5.tar.gz", hash = "sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c"}, ] [package.dependencies] @@ -1264,6 +1297,28 @@ typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9 [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] +[[package]] +name = "scikit-build" +version = "0.13.0" +description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "scikit-build-0.13.0.tar.gz", hash = "sha256:a6ca1b7f1cc8a718564c19f535014f3a71f34508f72e750d4221f987eed0f06d"}, + {file = "scikit_build-0.13.0-py2.py3-none-any.whl", hash = 
"sha256:f903fef5cd76aa81dee040fa9cf3daaeff5c71fccfe5fc0bf6a62e54b166d492"}, +] + +[package.dependencies] +distro = "*" +packaging = "*" +setuptools = {version = ">=28.0.0", markers = "python_version >= \"3\""} +wheel = ">=0.29.0" + +[package.extras] +docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] +test = ["build (>=0.5)", "codecov (>=2.0.5)", "coverage (>=4.2)", "cython (>=0.25.1)", "flake8 (>=3.0.4)", "path.py (>=11.5.0)", "pathlib2", "pytest (>=4.5.0)", "pytest-cov (>=2.7.1)", "pytest-mock (>=1.10.4)", "pytest-runner (>=5.1)", "pytest-virtualenv (>=1.2.5)", "requests", "six (>=1.10.0)", "ubelt (>=0.8.2)", "virtualenv", "xdoctest (>=0.10.0)"] + [[package]] name = "secretstorage" version = "3.3.3" @@ -1280,6 +1335,23 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "setuptools" +version = "67.7.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-67.7.2-py3-none-any.whl", hash = "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b"}, + {file = "setuptools-67.7.2.tar.gz", hash = "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -1353,20 +1425,21 @@ files = [ [[package]] name = "urllib3" -version = "1.26.15" +version = "2.0.2" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.7" files = [ - {file = "urllib3-1.26.15-py2.py3-none-any.whl", hash = "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"}, - {file = "urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305"}, + {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, + {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] [[package]] name = "watchdog" @@ -1420,6 +1493,21 @@ files = [ {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, ] +[[package]] +name = "wheel" +version = "0.40.0" +description = "A built-package format for Python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "wheel-0.40.0-py3-none-any.whl", hash = "sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247"}, + {file = "wheel-0.40.0.tar.gz", hash = "sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873"}, +] + +[package.extras] +test = ["pytest (>=6.0.0)"] + [[package]] name = "zipp" version = "3.15.0" @@ -1439,4 +1527,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" +content-hash = "3e4d61460097fb19821cef5a923353aa9e862db0f8a1e755b247a0f8f11cb781" diff --git a/pyproject.toml b/pyproject.toml index 9e63372..cf62d83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.11" pytest = "^7.3.1" httpx = "^0.24.0" +scikit-build = "0.13" [build-system] requires = [ From 7e03fdd766878a723457ef0e6de96d49b0179cb7 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sun, 30 Apr 2023 23:22:54 -0700 Subject: [PATCH 208/443] poetry: add poetry.toml, configure to install in a virtualenv Its cleaner to keep the virtualenv in the project directory, and poetry makes it easy to do that via this config. --- poetry.toml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 poetry.toml diff --git a/poetry.toml b/poetry.toml new file mode 100644 index 0000000..be97f1e --- /dev/null +++ b/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +in-project = true +prefer-active-python = true \ No newline at end of file From bebe7712f75a87e3b6dfecc722cbfe3d1a1ea85f Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sun, 30 Apr 2023 23:28:50 -0700 Subject: [PATCH 209/443] README: better setup instructions for developers for pip and poetry Give folks options + explicit instructions for installing with poetry or pip. 
--- README.md | 11 ++++ poetry.lock | 145 ++++++++++++++++++++++++++++++++++++++++++++++--- pyproject.toml | 7 ++- 3 files changed, 155 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b7772d9..ae633f4 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,17 @@ To get started, clone the repository and install the package in development mode ```bash git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git + +# Install with pip +pip install -e . + +# if you want to use the fastapi / openapi server +pip install -e .[server] + +# If you're a poetry user, installing will also include a virtual environment +poetry install --all-extras +. .venv/bin/activate + # Will need to be re-run any time vendor/llama.cpp is updated python3 setup.py develop ``` diff --git a/poetry.lock b/poetry.lock index 861de10..ad59963 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4,7 +4,7 @@ name = "anyio" version = "3.6.2" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "dev" +category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -268,7 +268,7 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -283,7 +283,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -372,6 +372,28 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fastapi" +version = "0.95.1" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "fastapi-0.95.1-py3-none-any.whl", hash = "sha256:a870d443e5405982e1667dfe372663abf10754f246866056336d7f01c21dab07"}, + {file = "fastapi-0.95.1.tar.gz", hash = "sha256:9569f0a381f8a457ec479d90fa01005cfddaae07546eb1f3fa035bc4797ae7d5"}, +] + +[package.dependencies] +pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +starlette = ">=0.26.1,<0.27.0" + +[package.extras] +all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>=0.12.0,<0.21.0)"] +doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"] +test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff 
(==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"] + [[package]] name = "ghp-import" version = "2.1.0" @@ -409,7 +431,7 @@ colorama = ">=0.4" name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -467,7 +489,7 @@ socks = ["socksio (>=1.0.0,<2.0.0)"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "dev" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -961,6 +983,59 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "1.10.7" +description = "Data validation and settings management using python type hints" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, + {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, + {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, + {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, + {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, + {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, + {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, + {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, + {file = "pydantic-1.10.7-py3-none-any.whl", hash = "sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, + {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, +] + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + [[package]] name = "pygments" version = "2.15.1" @@ -1368,7 +1443,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "dev" +category = "main" optional = false python-versions = 
">=3.7" files = [ @@ -1376,6 +1451,40 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "sse-starlette" +version = "1.5.0" +description = "\"SSE plugin for Starlette\"" +category = "main" +optional = true +python-versions = ">=3.8" +files = [ + {file = "sse-starlette-1.5.0.tar.gz", hash = "sha256:4fa989d906f29ba456a047071cbd9eab8c934042d5da4660543ad4b61c59c092"}, + {file = "sse_starlette-1.5.0-py3-none-any.whl", hash = "sha256:b41aac15f83191a4fc381e8cd152285cd44e328f409dc2bdfd4b7d7f33ea3865"}, +] + +[package.dependencies] +starlette = "*" + +[[package]] +name = "starlette" +version = "0.26.1" +description = "The little ASGI library that shines." +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "starlette-0.26.1-py3-none-any.whl", hash = "sha256:e87fce5d7cbdde34b76f0ac69013fd9d190d581d80681493016666e6f96c6d5e"}, + {file = "starlette-0.26.1.tar.gz", hash = "sha256:41da799057ea8620e4667a3e69a5b1923ebd32b1819c8fa75634bbe8d8bea9bd"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] + [[package]] name = "tomli" version = "2.0.1" @@ -1441,6 +1550,25 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "uvicorn" +version = "0.21.1" +description = "The lightning-fast ASGI server." +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "uvicorn-0.21.1-py3-none-any.whl", hash = "sha256:e47cac98a6da10cd41e6fd036d472c6f58ede6c5dbee3dbee3ef7a100ed97742"}, + {file = "uvicorn-0.21.1.tar.gz", hash = "sha256:0fac9cb342ba099e0d582966005f3fdba5b0290579fed4a6266dc702ca7bb032"}, +] + +[package.dependencies] +click = ">=7.0" +h11 = ">=0.8" + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] + [[package]] name = "watchdog" version = "3.0.0" @@ -1524,7 +1652,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[extras] +server = ["fastapi", "sse-starlette", "uvicorn"] + [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "3e4d61460097fb19821cef5a923353aa9e862db0f8a1e755b247a0f8f11cb781" +content-hash = "b1b158e4c9640e4dc197fe43e22c9f87e6e90945ec9b8bcba6042f81249d251e" diff --git a/pyproject.toml b/pyproject.toml index cf62d83..8aec94c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,9 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.5.0" - +uvicorn = { version = "^0.21.1", optional = true } +fastapi = { version = "^0.95.0", optional = true } +sse-starlette = { version = "^1.3.3", optional = true } [tool.poetry.group.dev.dependencies] black = "^23.3.0" @@ -27,6 +29,9 @@ pytest = "^7.3.1" httpx = "^0.24.0" scikit-build = "0.13" +[tool.poetry.extras] +server = ["uvicorn", "fastapi", 
"sse-starlette"] + [build-system] requires = [ "setuptools>=42", From 02e8a018ae1da78e66e4843ada1c62fa7c2cb4a2 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Tue, 9 May 2023 16:19:46 -0700 Subject: [PATCH 210/443] llama_cpp server: document presence_penalty and frequency_penalty, mark as supported --- llama_cpp/server/app.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 621b73e..be6bac8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -152,6 +152,19 @@ repeat_penalty_field = Field( + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) class CreateCompletionRequest(BaseModel): prompt: Optional[str] = Field( @@ -175,13 +188,13 @@ class CreateCompletionRequest(BaseModel): ge=0, description="The number of logprobs to generate. If None, no logprobs are generated.", ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 best_of: Optional[int] = 1 logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) @@ -269,12 +282,12 @@ class CreateChatCompletionRequest(BaseModel): top_p: float = top_p_field stop: Optional[List[str]] = stop_field stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) From cdeaded2516509b10e9494c2884dcef8b06a4fe2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 10 May 2023 16:12:17 -0400 Subject: [PATCH 211/443] Bugfix: Ensure logs are printed when streaming --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7be51e1..bbce4b2 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -709,6 +709,9 @@ class Llama: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + if stream: yield { "id": completion_id, @@ -780,9 +783,6 @@ class Llama: "top_logprobs": top_logprobs, } - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) - yield { "id": completion_id, "object": "text_completion", From 35229f5eab0cff843a63508321220f66c1b0f9f6 Mon Sep 17 
00:00:00 2001 From: Andrei Betlen Date: Thu, 11 May 2023 10:05:34 -0400 Subject: [PATCH 212/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd45..cf348a6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit cf348a60e0af3905acd1d297cb064b918265b7ac From 28ee2adec2ef8a685f6a8d0ce73c6c3c7a136532 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 11 May 2023 21:15:12 -0400 Subject: [PATCH 213/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cf348a6..b608b55 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cf348a60e0af3905acd1d297cb064b918265b7ac +Subproject commit b608b55a3ea8e4760c617418538465449175bdb8 From 7be584fe82c7c24f0aab9fc8e23b83820600ed52 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 11 May 2023 21:56:19 -0400 Subject: [PATCH 214/443] Add missing tfs_z paramter --- llama_cpp/llama.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bbce4b2..47fa543 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -437,6 +437,7 @@ class Llama: reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -495,6 +496,7 @@ class Llama: repeat_penalty=repeat_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -576,6 +578,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -634,6 +637,7 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -818,6 +822,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -858,6 +863,7 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -883,6 +889,7 @@ class Llama: repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -923,6 +930,7 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, @@ -998,6 +1006,7 @@ class Llama: presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, @@ -1035,6 +1044,7 @@ class Llama: repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, From e3d3c31da24655d27992874f5917657e197587d8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 11 May 2023 21:56:43 -0400 Subject: [PATCH 
215/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9e63372..694f4e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.48" +version = "0.1.49" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f4cbb60..de7bcdc 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.48", + version="0.1.49", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 684d7c8c17a1c50cff2703ae5982390111c991dc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 11 May 2023 22:12:35 -0400 Subject: [PATCH 216/443] Fix docker command --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f494f9..5c1dd91 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API From 8895b9002acefbccfee0cfc36f22ede7410b64e2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 12 May 2023 07:16:57 -0400 Subject: [PATCH 217/443] Revert "llama_cpp server: prompt is a string". Closes #187 This reverts commit b9098b0ef7309b63ebff99cdfadf641223c15025. --- llama_cpp/server/app.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index be6bac8..b1237e5 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -167,8 +167,9 @@ frequency_penalty_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Optional[str] = Field( - default="", description="The prompt to generate completions for." + prompt: Union[str, List[str]] = Field( + default="", + description="The prompt to generate completions for." ) suffix: Optional[str] = Field( default=None, @@ -222,6 +223,9 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): + if isinstance(request.prompt, list): + request.prompt = "".join(request.prompt) + completion_or_chunks = llama( **request.dict( exclude={ From 8740ddc58e750c243721ba56a7b0b73b8359fdef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 12 May 2023 07:21:46 -0400 Subject: [PATCH 218/443] Only support generating one prompt at a time. 
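The completions endpoint keeps accepting the OpenAI-style list form of `prompt`, but only with zero or one element; batched prompts are rejected. A standalone sketch of the resulting behavior (a hypothetical helper that mirrors the handling added in `create_completion`, not code from this patch):

```python
from typing import List, Union


def normalize_prompt(prompt: Union[str, List[str]]) -> str:
    """Mirrors the server-side handling: a list is allowed but may hold
    at most one prompt, and an empty list falls back to an empty string."""
    if isinstance(prompt, list):
        assert len(prompt) <= 1, "generating multiple prompts at once is not supported"
        return prompt[0] if len(prompt) > 0 else ""
    return prompt


print(normalize_prompt("Hello"))    # -> Hello
print(normalize_prompt(["Hello"]))  # -> Hello
print(normalize_prompt([]))         # -> (empty string)
# normalize_prompt(["a", "b"]) raises AssertionError
```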
--- llama_cpp/server/app.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b1237e5..3415a5a 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -166,10 +166,10 @@ frequency_penalty_field = Field( description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", ) + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( - default="", - description="The prompt to generate completions for." + default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( default=None, @@ -224,7 +224,8 @@ def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) + assert len(request.prompt) <= 1 + request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" completion_or_chunks = llama( **request.dict( From 7a536e86c260872c0551e52df37ba8b45317068e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 12 May 2023 14:28:22 -0400 Subject: [PATCH 219/443] Allow model to tokenize strings longer than context length and set add_bos. Closes #92 --- llama_cpp/llama.py | 20 +++++++++++++++++--- llama_cpp/llama_cpp.py | 2 +- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 47fa543..4295ba7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -174,7 +174,9 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + def tokenize( + self, text: bytes, add_bos: bool = True + ) -> List[llama_cpp.llama_token]: """Tokenize a string. Args: @@ -194,10 +196,22 @@ class Llama: text, tokens, n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_bool(add_bos), ) if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * int(n_tokens))() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e60558c..870eced 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -350,7 +350,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) From cdf59768f52cbf3e54bfe2877d0e5cd3049c04a6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 00:04:22 -0400 Subject: [PATCH 220/443] Update llama.cpp --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_cpp.py | 9 +++++---- llama_cpp/server/app.py | 6 ++++++ vendor/llama.cpp | 2 +- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4295ba7..362ebd9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -83,6 +83,7 @@ class Llama: # NOTE: These parameters are likely to change in the future. 
n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -129,6 +130,7 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -1081,6 +1083,7 @@ class Llama: model_path=self.model_path, n_ctx=self.params.n_ctx, n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1100,6 +1103,7 @@ class Llama: model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 870eced..71e78d9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -68,7 +68,7 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) +LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -109,6 +109,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3415a5a..8a83674 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -17,6 +17,11 @@ class Settings(BaseSettings): description="The path to the model to use for generating completions." ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b608b55..08737ef 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b608b55a3ea8e4760c617418538465449175bdb8 +Subproject commit 08737ef720f0510c7ec2aa84d7f70c691073c35d From d90c9df32639397078a439c86ac6a474043ce57d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 00:04:49 -0400 Subject: [PATCH 221/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 694f4e3..9a70190 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.49" +version = "0.1.50" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index de7bcdc..b056ce4 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.49", + version="0.1.50", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From ceec21f1e990a1ec96704e0768a22822a26ce2be Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 22:07:35 -0400 Subject: [PATCH 222/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 08737ef..b5c9295 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 08737ef720f0510c7ec2aa84d7f70c691073c35d +Subproject commit b5c9295eef2b56e307393b35b3a923e3518d226e From c804efe3f01fed7f1a098c75f4972d9a7fba2f46 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 22:08:11 -0400 Subject: [PATCH 223/443] Fix obscure Wndows DLL issue. Closes #208 --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 71e78d9..ab74c41 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -52,7 +52,7 @@ def _load_shared_library(lib_base_name: str): for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), winmode=0) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From 3718799b371b77d09a86d566f6a62badd3d64988 Mon Sep 17 00:00:00 2001 From: Anchen Date: Mon, 15 May 2023 20:46:59 +1000 Subject: [PATCH 224/443] chore: add note for Mac m1 installation --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 5c1dd91..1f84946 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,12 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. +Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. 
For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. ### Installation with OpenBLAS / cuBLAS / CLBlast From cbac19bf248671d26db373334a4ed82debe7bebd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 15 May 2023 09:15:01 -0400 Subject: [PATCH 225/443] Add winmode arg only on windows if python version supports it --- llama_cpp/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ab74c41..3bd4cac 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,15 +44,17 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path), winmode=0) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From 7526b3f6f96a859186855828615b9e9192f74902 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 May 2023 21:05:54 +0000 Subject: [PATCH 226/443] Bump mkdocs-material from 9.1.11 to 9.1.12 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.11 to 9.1.12. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.11...9.1.12) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5474bf4..5289b29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.11" +version = "9.1.12" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, - {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, + {file = "mkdocs_material-9.1.12-py3-none-any.whl", hash = "sha256:68c57d95d10104179c8c3ce9a88ee9d2322a5145b3d0f1f38ff686253fb5ec98"}, + {file = "mkdocs_material-9.1.12.tar.gz", hash = "sha256:d4ebe9b5031ce63a265c19fb5eab4d27ea4edadb05de206372e831b2b7570fb5"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" +content-hash = "d188fc14200f7ee348bef821265d676d584762983bcaf10f90c14221b4ed26a9" diff --git a/pyproject.toml b/pyproject.toml index 9a70190..6613ee0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.11" +mkdocs-material = "^9.1.12" pytest = "^7.3.1" httpx = "^0.24.0" From 408dd14e5b0a5e9a4c03692c79e4538995ebb191 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 15 May 2023 14:52:25 -0700 Subject: [PATCH 227/443] Update README.md Fix typo. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f84946..4380ca5 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? 
A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) >>> llama_cpp.llama_free(ctx) ``` From 341c50b5b0b63a36c30f54f6e90b88e8a0956c15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 09:07:14 -0400 Subject: [PATCH 228/443] Fix CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bda2388..16932b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,4 @@ else() LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp ) -endif(UNIX) +endif() From a65125c0bd28ee2b787a3d53688fb67b0f871b9e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 09:35:50 -0400 Subject: [PATCH 229/443] Add sampling defaults for generate --- llama_cpp/llama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 362ebd9..6c4e153 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -446,10 +446,10 @@ class Llama: def generate( self, tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, From 214589e462efb75823abb02aca4f1cc8d213a548 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 17:20:45 -0400 Subject: [PATCH 230/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b5c9295..4262742 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b5c9295eef2b56e307393b35b3a923e3518d226e +Subproject commit 42627421ece816e632e6a0d757fa75150c687f87 From a3352923c7fe5a7b2a2db714bdc333512c61a62e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 17:22:00 -0400 Subject: [PATCH 231/443] Add model_alias option to override model_path in completions. Closes #39 --- llama_cpp/llama.py | 19 ++++++++++++++----- llama_cpp/server/app.py | 24 ++++++++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6c4e153..48fde53 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -522,7 +522,7 @@ class Llama: if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding(self, input: str, model: Optional[str] = None) -> Embedding: """Embed a string. Args: @@ -532,6 +532,7 @@ class Llama: An embedding object. 
""" assert self.ctx is not None + _model: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -561,7 +562,7 @@ class Llama: "index": 0, } ], - "model": self.model_path, + "model": _model, "usage": { "prompt_tokens": n_tokens, "total_tokens": n_tokens, @@ -598,6 +599,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -610,6 +612,7 @@ class Llama: text: bytes = b"" returned_characters: int = 0 stop = stop if stop is not None else [] + _model: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -708,7 +711,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": _model, "choices": [ { "text": text[start:].decode("utf-8", errors="ignore"), @@ -737,7 +740,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": _model, "choices": [ { "text": text[returned_characters:].decode( @@ -807,7 +810,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": _model, "choices": [ { "text": text_str, @@ -842,6 +845,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -883,6 +887,7 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -909,6 +914,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -950,6 +956,7 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) def _convert_text_completion_to_chat( @@ -1026,6 +1033,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1064,6 +1072,7 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 8a83674..e8f62e8 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -16,6 +16,10 @@ class Settings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions." 
) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") n_gpu_layers: int = Field( default=0, @@ -64,6 +68,7 @@ class Settings(BaseSettings): router = APIRouter() +settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None @@ -101,6 +106,12 @@ def create_app(settings: Optional[Settings] = None): if settings.cache: cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) return app @@ -112,6 +123,10 @@ def get_llama(): yield llama +def get_settings(): + yield settings + + model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( @@ -236,7 +251,6 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model", "n", "best_of", "logit_bias", @@ -274,7 +288,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"user"})) class ChatCompletionRequestMessage(BaseModel): @@ -335,7 +349,6 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", "n", "logit_bias", "user", @@ -378,13 +391,16 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) def get_models( + settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", "data": [ { - "id": llama.model_path, + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, "object": "model", "owned_by": "me", "permissions": [], From a7c9e3828735d550f67aac858c0819b7d98e652f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 18:07:25 -0400 Subject: [PATCH 232/443] Update variable name --- llama_cpp/llama.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 48fde53..054bdc2 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -532,7 +532,7 @@ class Llama: An embedding object. 
""" assert self.ctx is not None - _model: str = model if model is not None else self.model_path + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -562,7 +562,7 @@ class Llama: "index": 0, } ], - "model": _model, + "model": model_name, "usage": { "prompt_tokens": n_tokens, "total_tokens": n_tokens, @@ -612,7 +612,7 @@ class Llama: text: bytes = b"" returned_characters: int = 0 stop = stop if stop is not None else [] - _model: str = model if model is not None else self.model_path + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -711,7 +711,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": _model, + "model": model_name, "choices": [ { "text": text[start:].decode("utf-8", errors="ignore"), @@ -740,7 +740,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": _model, + "model": model_name, "choices": [ { "text": text[returned_characters:].decode( @@ -810,7 +810,7 @@ class Llama: "id": completion_id, "object": "text_completion", "created": created, - "model": _model, + "model": model_name, "choices": [ { "text": text_str, From e37a808bc012bf660deb071e2126c38f92d7c2f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 16 May 2023 23:33:53 -0400 Subject: [PATCH 233/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4262742..2b26469 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 42627421ece816e632e6a0d757fa75150c687f87 +Subproject commit 2b2646931bd2a2eb3e21c6f3733cc0e090b2e24b From 7e5524454052030b6ed31aa3fab28554955338a8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 01:41:42 -0400 Subject: [PATCH 234/443] Fix top_k value. 
Closes #220 --- llama_cpp/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 054bdc2..4c8ba39 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -295,6 +295,7 @@ class Llama: assert self.ctx is not None assert len(self.eval_logits) > 0 n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k logits = self.eval_logits[-1] data = (llama_cpp.llama_token_data * n_vocab)( *[ From f11e2a781c6f3a6de03d67a52fa529e6c147f1b3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 01:42:51 -0400 Subject: [PATCH 235/443] Fix last_n_tokens_size --- llama_cpp/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4c8ba39..44363a8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -295,7 +295,9 @@ class Llama: assert self.ctx is not None assert len(self.eval_logits) > 0 n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = llama_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size logits = self.eval_logits[-1] data = (llama_cpp.llama_token_data * n_vocab)( *[ From d28b753ed2c49998dfc6e5ee83d91c6488f4f15d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 01:53:26 -0400 Subject: [PATCH 236/443] Implement penalize_nl --- llama_cpp/llama.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 44363a8..7e17d36 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -291,6 +291,7 @@ class Llama: mirostat_mode: llama_cpp.c_int, mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -299,6 +300,7 @@ class Llama: top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = llama_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size logits = self.eval_logits[-1] + nl_logit = logits[llama_cpp.llama_token_nl().value] data = (llama_cpp.llama_token_data * n_vocab)( *[ llama_cpp.llama_token_data( @@ -331,6 +333,8 @@ class Llama: alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) + if not penalize_nl: + candidates.data[llama_cpp.llama_token_nl().value].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -413,6 +417,7 @@ class Llama: mirostat_mode: int = 0, mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, + penalize_nl: bool = True, ): """Sample a token from the model. 
@@ -444,6 +449,7 @@ class Llama: mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, ) def generate( @@ -1170,6 +1176,11 @@ class Llama: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + @staticmethod + def token_nl() -> llama_cpp.llama_token: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] From f5c2f998ab10fb1e04a8219678324300d3104243 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 02:00:39 -0400 Subject: [PATCH 237/443] Format --- llama_cpp/llama.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7e17d36..18dd183 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -298,7 +298,11 @@ class Llama: n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k - last_n_tokens_size = llama_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size + last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) logits = self.eval_logits[-1] nl_logit = logits[llama_cpp.llama_token_nl().value] data = (llama_cpp.llama_token_data * n_vocab)( From 4f342795e541a58293acf13891f8d7bb65089784 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 03:35:13 -0400 Subject: [PATCH 238/443] Update token checks --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 18dd183..f47f4a4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -304,7 +304,7 @@ class Llama: else last_n_tokens_size ) logits = self.eval_logits[-1] - nl_logit = logits[llama_cpp.llama_token_nl().value] + nl_logit = logits[int(Llama.token_nl())] data = (llama_cpp.llama_token_data * n_vocab)( *[ llama_cpp.llama_token_data( @@ -338,7 +338,7 @@ class Llama: alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[llama_cpp.llama_token_nl().value].logit = nl_logit + candidates.data[int(Llama.token_nl())].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -677,7 +677,7 @@ class Llama: presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): - if token == llama_cpp.llama_token_eos(): + if token == Llama.token_eos(): text = self.detokenize(completion_tokens) finish_reason = "stop" break From 70695c430bba71382ec88ba4717b5e85c988245f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 11:40:12 -0400 Subject: [PATCH 239/443] Move docs link up --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4380ca5..17cc28c 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This package provides: - OpenAI-like API - LangChain compatibility +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). 
+ ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): From e9794f91f25db70b998a92e962404a7652eda5e5 Mon Sep 17 00:00:00 2001 From: Aneesh Joy Date: Wed, 17 May 2023 18:04:58 +0100 Subject: [PATCH 240/443] Fixd CUBLAS dll load issue in Windows --- llama_cpp/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3bd4cac..eeda58b 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -48,6 +48,7 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From 61d58e7b352e84a4209794aee91171547e2310f1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 15:26:38 -0400 Subject: [PATCH 241/443] Check for CUDA_PATH before adding --- llama_cpp/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index eeda58b..24ab40a 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -48,7 +48,9 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"lib")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From db10e0078be5eff581c1096c0ba7a9dc3c432505 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 16:14:01 -0400 Subject: [PATCH 242/443] Update docs --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index c36adff..99b1f59 100644 --- a/docs/index.md +++ b/docs/index.md @@ -112,8 +112,12 @@ python3 setup.py develop show_root_heading: true ::: llama_cpp.LlamaCache + options: + show_root_heading: true ::: llama_cpp.LlamaState + options: + show_root_heading: true ::: llama_cpp.llama_cpp options: From 50e136252a747d451e000e1d09cec64d85c4b2dd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 16:14:12 -0400 Subject: [PATCH 243/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2b26469..c238b58 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2b2646931bd2a2eb3e21c6f3733cc0e090b2e24b +Subproject commit c238b5873a1ea496db03ffcfe124c9d0d83afbc6 From 6c57d38552346cb5a945925e99e143282084fe4d Mon Sep 17 00:00:00 2001 From: Marcel Coetzee Date: Thu, 18 May 2023 16:02:42 +0200 Subject: [PATCH 244/443] Decrement CUDA version and bump Ubuntu Signed-off-by: Marcel Coetzee --- Dockerfile.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index a852f3c..d9451bc 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 +FROM nvidia/cuda:12.0.1-devel-ubuntu22.04 # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -12,4 +12,4 @@ RUN python3 -m 
pip install --upgrade pip pytest cmake scikit-build setuptools fa RUN LLAMA_CUBLAS=1 python3 setup.py develop # Run the server -CMD python3 -m llama_cpp.server \ No newline at end of file +CMD python3 -m llama_cpp.server From 6ece8a225a9cad89eec41f08744e3730c600ebcd Mon Sep 17 00:00:00 2001 From: Marcel Coetzee Date: Thu, 18 May 2023 16:59:42 +0200 Subject: [PATCH 245/443] Set CUDA_VERSION as build ARG Signed-off-by: Marcel Coetzee --- Dockerfile.cuda | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index d9451bc..e95fa23 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -1,4 +1,5 @@ -FROM nvidia/cuda:12.0.1-devel-ubuntu22.04 +ARG CUDA_VERSION=12.1.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 From 21d8f5fa9f35ac2adc2181b0d748fba7ffe8dfb8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 18 May 2023 11:35:15 -0400 Subject: [PATCH 246/443] Remove unnused union --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bfc7342..e8f4ce1 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict, Union +from typing import List, Optional, Dict from typing_extensions import TypedDict, NotRequired, Literal From f0ec6e615ecae3d7b5343d78c9063003dfb71e6a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 18 May 2023 11:35:59 -0400 Subject: [PATCH 247/443] Stream tokens instead of text chunks --- llama_cpp/llama.py | 112 +++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f47f4a4..bf4caf7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -623,7 +623,7 @@ class Llama: b" " + prompt.encode("utf-8") ) text: bytes = b"" - returned_characters: int = 0 + returned_tokens: int = 0 stop = stop if stop is not None else [] model_name: str = model if model is not None else self.model_path @@ -707,33 +707,42 @@ class Llama: break if stream: - start = returned_characters - longest = 0 # We want to avoid yielding any characters from # the generated text if they are part of a stop # sequence. 
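# Usage sketch for the per-token streaming implemented here (illustrative
# only; the model path below is a placeholder):
from llama_cpp import Llama

llm = Llama(model_path="./models/7b/ggml-model.bin")
for chunk in llm("Q: Name the planets in the solar system? A:",
                 max_tokens=32, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)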
+ longest = 0 for s in stop_sequences: for i in range(len(s), 0, -1): if all_text.endswith(s[:i]): if i > longest: longest = i break - text = all_text[: len(all_text) - longest] - returned_characters += len(text[start:]) - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": text[start:].decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": None, - "finish_reason": None, - } - ], - } + + offset = 0 + remaining_tokens = completion_tokens[returned_tokens:] + remaining_length = len(self.detokenize(remaining_tokens)) + for token in remaining_tokens: + offset += len(self.detokenize([token])) + # Check if stop sequence is not in the token + if offset >= (remaining_length - longest - 1): + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": None, + "finish_reason": None, + } + ], + } if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) @@ -749,22 +758,57 @@ class Llama: llama_cpp.llama_print_timings(self.ctx) if stream: - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": text[returned_characters:].decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, + remaining_tokens = completion_tokens[returned_tokens:] + all_text = self.detokenize(remaining_tokens) + any_stop = [s for s in stop_sequences if s in all_text] + if len(any_stop) > 0: + end = min(all_text.index(stop) for stop in any_stop) + else: + end = len(all_text) + + offset = 0 + for token in remaining_tokens: + offset += len(self.detokenize([token])) + if offset >= end: + last_text = self.detokenize([token]) + if offset == end - 1: + break + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": last_text[ + : len(last_text) - (offset - end) + ].decode("utf-8", errors="ignore"), + "index": 0, + "logprobs": None, + "finish_reason": finish_reason, + } + ], } - ], - } + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": None, + "finish_reason": finish_reason + if returned_tokens == len(completion_tokens) + else None, + } + ], + } return text_str = text.decode("utf-8", errors="ignore") From dc39cc0fa410f8b46954ad507b705052947da6bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 02:04:30 -0400 Subject: [PATCH 248/443] Use server sent events function for streaming completion --- llama_cpp/server/app.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e8f62e8..3f95bdd 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -259,8 +259,15 @@ def create_completion( ) ) if request.stream: + + async def server_sent_events( + chunks: Iterator[llama_cpp.CompletionChunk], + ): + for chunk in chunks: + yield dict(data=json.dumps(chunk)) + chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + 
return EventSourceResponse(server_sent_events(chunks)) completion: llama_cpp.Completion = completion_or_chunks # type: ignore return completion From a634a2453b0088b0fcdcae7f936ccf8ca7f1a04b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 02:04:57 -0400 Subject: [PATCH 249/443] Allow first logprob token to be null to match openai api --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index e8f4ce1..7729ced 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -22,9 +22,9 @@ class Embedding(TypedDict): class CompletionLogprobs(TypedDict): text_offset: List[int] - token_logprobs: List[float] + token_logprobs: List[Optional[float]] tokens: List[str] - top_logprobs: List[Dict[str, float]] + top_logprobs: List[Optional[Dict[str, float]]] class CompletionChoice(TypedDict): From 17d4271b04cd5e11816d6623d7a5ef640a6bfbe4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 02:20:27 -0400 Subject: [PATCH 250/443] Fix logprobs for completions and implement for streaming logprobs. --- llama_cpp/llama.py | 125 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bf4caf7..58c32e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -710,22 +710,56 @@ class Llama: # We want to avoid yielding any characters from # the generated text if they are part of a stop # sequence. - longest = 0 + first_stop_position = 0 for s in stop_sequences: for i in range(len(s), 0, -1): if all_text.endswith(s[:i]): - if i > longest: - longest = i + if i > first_stop_position: + first_stop_position = i break - offset = 0 + token_end_position = 0 remaining_tokens = completion_tokens[returned_tokens:] remaining_length = len(self.detokenize(remaining_tokens)) for token in remaining_tokens: - offset += len(self.detokenize([token])) - # Check if stop sequence is not in the token - if offset >= (remaining_length - longest - 1): + token_end_position += len(self.detokenize([token])) + # Check if stop sequence is in the token + if token_end_position >= (remaining_length - first_stop_position - 1): break + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens + logits = self.eval_logits[token_offset - 1] + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([llama_cpp.llama_token(i)]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } returned_tokens += 1 yield { "id": completion_id, @@ -738,7 +772,7 @@ class Llama: "utf-8", errors="ignore" ), "index": 0, - "logprobs": None, + "logprobs": logprobs_or_none, "finish_reason": None, } ], @@ -766,13 +800,48 @@ class Llama: else: end = len(all_text) - offset = 0 + token_end_position = 0 for token in 
remaining_tokens: - offset += len(self.detokenize([token])) - if offset >= end: + token_end_position += len(self.detokenize([token])) + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens - 1 + logits = self.eval_logits[token_offset] + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([llama_cpp.llama_token(i)]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } + + if token_end_position >= end: last_text = self.detokenize([token]) - if offset == end - 1: + if token_end_position == end - 1: break + returned_tokens += 1 yield { "id": completion_id, "object": "text_completion", @@ -781,10 +850,10 @@ class Llama: "choices": [ { "text": last_text[ - : len(last_text) - (offset - end) + : len(last_text) - (token_end_position - end) ].decode("utf-8", errors="ignore"), "index": 0, - "logprobs": None, + "logprobs": logprobs_or_none, "finish_reason": finish_reason, } ], @@ -802,7 +871,7 @@ class Llama: "utf-8", errors="ignore" ), "index": 0, - "logprobs": None, + "logprobs": logprobs_or_none, "finish_reason": finish_reason if returned_tokens == len(completion_tokens) else None, @@ -821,13 +890,19 @@ class Llama: logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: - text_offset = 0 + text_offset = 0 if echo else len(prompt) + token_offset = 0 if echo else len(prompt_tokens[1:]) text_offsets: List[int] = [] - token_logprobs: List[float] = [] + token_logprobs: List[Optional[float]] = [] tokens: List[str] = [] - top_logprobs: List[Dict[str, float]] = [] + top_logprobs: List[Optional[Dict[str, float]]] = [] + + if echo: + # Remove leading BOS token + all_tokens = prompt_tokens[1:] + completion_tokens + else: + all_tokens = completion_tokens - all_tokens = prompt_tokens + completion_tokens all_token_strs = [ self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens @@ -835,7 +910,7 @@ class Llama: all_logprobs = [ Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits - ] + ][token_offset:] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -848,14 +923,20 @@ class Llama: ) ) token_logprobs.append(sorted_logprobs[int(token)][0]) - top_logprob = { + top_logprob: Optional[Dict[str, float]] = { self.detokenize([llama_cpp.llama_token(i)]).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) + top_logprob.update({token_str: logprobs_token[int(token)]}) top_logprobs.append(top_logprob) + # Weird idosincracy of the OpenAI API where + # token_logprobs and top_logprobs are null for + # the first token. 
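# Usage sketch for requesting per-token logprobs (illustrative only; the model
# path is a placeholder). Loading with logits_all=True keeps the logits this
# code indexes into:
from llama_cpp import Llama

llm = Llama(model_path="./models/7b/ggml-model.bin", logits_all=True)
out = llm("Hello", max_tokens=8, logprobs=5, echo=True)
print(out["choices"][0]["logprobs"]["token_logprobs"])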
+ if echo and len(all_tokens) > 0: + token_logprobs[0] = None + top_logprobs[0] = None logprobs_or_none = { "tokens": tokens, "text_offset": text_offsets, From f0812c4d8c9985941286b8c99351663888ae8b12 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 02:20:41 -0400 Subject: [PATCH 251/443] Add upgrade instructions to the README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 17cc28c..4e442c1 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,12 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. +If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: + +```bash +pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir +``` + Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: ``` wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh From a8cd169251cf6c8bfef2bfc397ddb89c19f6d3d9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 03:15:08 -0400 Subject: [PATCH 252/443] Bugfix: Stop sequences can be strings --- llama_cpp/llama.py | 12 ++++++------ llama_cpp/server/app.py | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 58c32e9..da5b0e3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -602,7 +602,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -624,7 +624,7 @@ class Llama: ) text: bytes = b"" returned_tokens: int = 0 - stop = stop if stop is not None else [] + stop = stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] model_name: str = model if model is not None else self.model_path if self.verbose: @@ -973,7 +973,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1042,7 +1042,7 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1162,7 +1162,7 @@ class Llama: top_p: float = 0.95, top_k: int = 40, stream: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], max_tokens: int = 256, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, @@ -1188,7 +1188,7 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. 
""" - stop = stop if stop is not None else [] + stop = stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] chat_history = "".join( f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' for message in messages diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3f95bdd..1ff0d1e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,4 +1,5 @@ import json +import logging import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict @@ -203,7 +204,7 @@ class CreateCompletionRequest(BaseModel): default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) - stop: Optional[List[str]] = stop_field + stop: Optional[Union[str, List[str]]] = stop_field stream: bool = stream_field logprobs: Optional[int] = Field( default=None, From c7788c85ab9d093d46e470766f6cce54687abfe5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 03:16:58 -0400 Subject: [PATCH 253/443] Add Guidance example --- examples/notebooks/Guidance.ipynb | 89 +++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/notebooks/Guidance.ipynb diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb new file mode 100644 index 0000000..045856e --- /dev/null +++ b/examples/notebooks/Guidance.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
+       "\n",
+       "Where there is no guidance, a people falls,\n",
+       "but in an abundance of counselors there is safety.\n",
+       "- Proverbs 11:14\n",
+       "\n",
+       "UPDATED\n",
+       "Where there is no guidance for assembling a model, people will struggle,\n",
+       "but with clear instructions, the process becomes safe and successful.\n",
+       "- GPT 2 (updated): Proverbs 11:14
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", + "\n", + "import guidance\n", + "\n", + "# set the default language model used to execute guidance programs\n", + "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", + "\n", + "# define a guidance program that adapts a proverb\n", + "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n", + "\n", + "{{proverb}}\n", + "- {{book}} {{chapter}}:{{verse}}\n", + "\n", + "UPDATED\n", + "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", + "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n", + "\n", + "# execute the program on a specific proverb\n", + "executed_program = program(\n", + " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", + " book=\"Proverbs\",\n", + " chapter=11,\n", + " verse=14\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f82d85fbee65bc6c26d67bdd994e9ddb2755a472 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 03:19:27 -0400 Subject: [PATCH 254/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6613ee0..7a52669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.50" +version = "0.1.51" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index b056ce4..ef6012e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.50", + version="0.1.51", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 01a010be521c076f851789ad56bec82284fdf96e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 11:59:33 -0400 Subject: [PATCH 255/443] Fix llama_cpp and Llama type signatures. 
Closes #221 --- llama_cpp/llama.py | 76 +++++++++++++++++++----------------------- llama_cpp/llama_cpp.py | 42 +++++++++++------------ tests/test_llama.py | 4 +-- 3 files changed, 58 insertions(+), 64 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index da5b0e3..564c6c3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -15,9 +15,7 @@ class LlamaCache: """Cache for a llama.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): - self.cache_state: OrderedDict[ - Tuple[llama_cpp.llama_token, ...], "LlamaState" - ] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() self.capacity_bytes = capacity_bytes @property @@ -26,8 +24,8 @@ class LlamaCache: def _find_longest_prefix_key( self, - key: Tuple[llama_cpp.llama_token, ...], - ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None keys = ( @@ -39,7 +37,7 @@ class LlamaCache: min_key = k return min_key - def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -48,10 +46,10 @@ class LlamaCache: self.cache_state.move_to_end(_key) return value - def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: + def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): + def __setitem__(self, key: Sequence[int], value: "LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -63,7 +61,7 @@ class LlamaCache: class LlamaState: def __init__( self, - eval_tokens: Deque[llama_cpp.llama_token], + eval_tokens: Deque[int], eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: int, @@ -141,7 +139,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) + self.eval_tokens: Deque[int] = deque(maxlen=n_ctx) self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[LlamaCache] = None @@ -176,9 +174,7 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize( - self, text: bytes, add_bos: bool = True - ) -> List[llama_cpp.llama_token]: + def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. Args: @@ -197,7 +193,7 @@ class Llama: self.ctx, text, tokens, - n_ctx, + llama_cpp.c_int(n_ctx), llama_cpp.c_bool(add_bos), ) if int(n_tokens) < 0: @@ -216,7 +212,7 @@ class Llama: ) return list(tokens[:n_tokens]) - def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: + def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. Args: @@ -228,7 +224,9 @@ class Llama: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str(self.ctx, token) + output += llama_cpp.llama_token_to_str( + self.ctx, llama_cpp.llama_token(token) + ) return output def set_cache(self, cache: Optional[LlamaCache]): @@ -244,7 +242,7 @@ class Llama: self.eval_tokens.clear() self.eval_logits.clear() - def eval(self, tokens: Sequence[llama_cpp.llama_token]): + def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. 
Args: @@ -458,7 +456,7 @@ class Llama: def generate( self, - tokens: Sequence[llama_cpp.llama_token], + tokens: Sequence[int], top_k: int = 40, top_p: float = 0.95, temp: float = 0.80, @@ -470,9 +468,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - ) -> Generator[ - llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None - ]: + ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. Examples: @@ -617,14 +613,14 @@ class Llama: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) - completion_tokens: List[llama_cpp.llama_token] = [] + completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens: List[llama_cpp.llama_token] = self.tokenize( - b" " + prompt.encode("utf-8") - ) + prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) text: bytes = b"" returned_tokens: int = 0 - stop = stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) model_name: str = model if model is not None else self.model_path if self.verbose: @@ -724,7 +720,9 @@ class Llama: for token in remaining_tokens: token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token - if token_end_position >= (remaining_length - first_stop_position - 1): + if token_end_position >= ( + remaining_length - first_stop_position - 1 + ): break logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: @@ -744,7 +742,7 @@ class Llama: ) ) top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode( + self.detokenize([i]).decode( "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -822,9 +820,7 @@ class Llama: ) ) top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode( - "utf-8", errors="ignore" - ): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: current_logprobs[int(token)]}) @@ -924,9 +920,7 @@ class Llama: ) token_logprobs.append(sorted_logprobs[int(token)][0]) top_logprob: Optional[Dict[str, float]] = { - self.detokenize([llama_cpp.llama_token(i)]).decode( - "utf-8", errors="ignore" - ): logprob + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } top_logprob.update({token_str: logprobs_token[int(token)]}) @@ -1188,7 +1182,9 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. 
""" - stop = stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) chat_history = "".join( f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' for message in messages @@ -1296,17 +1292,17 @@ class Llama: raise RuntimeError("Failed to set llama state data") @staticmethod - def token_eos() -> llama_cpp.llama_token: + def token_eos() -> int: """Return the end-of-sequence token.""" return llama_cpp.llama_token_eos() @staticmethod - def token_bos() -> llama_cpp.llama_token: + def token_bos() -> int: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() @staticmethod - def token_nl() -> llama_cpp.llama_token: + def token_nl() -> int: """Return the newline token.""" return llama_cpp.llama_token_nl() @@ -1317,9 +1313,7 @@ class Llama: return [math.log(x / sum_exps) for x in exps] @staticmethod - def longest_token_prefix( - a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token] - ): + def longest_token_prefix(a: Sequence[int], b: Sequence[int]): longest_prefix = 0 for _a, _b in zip(a, b): if _a == _b: diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 24ab40a..0dcb16c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,13 +44,13 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] - cdll_args = dict() # type: ignore + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"],"lib")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors @@ -194,7 +194,7 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory def llama_free(ctx: llama_context_p): - _lib.llama_free(ctx) + return _lib.llama_free(ctx) _lib.llama_free.argtypes = [llama_context_p] @@ -206,7 +206,7 @@ _lib.llama_free.restype = None # nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int -) -> c_int: +) -> int: return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) @@ -225,7 +225,7 @@ def llama_apply_lora_from_file( path_lora: c_char_p, path_base_model: c_char_p, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -234,7 +234,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -253,7 +253,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: +def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -293,7 +293,7 @@ def llama_load_session_file( tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, n_token_count_out, # type: _Pointer[c_size_t] -) -> c_size_t: +) -> int: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out ) @@ -314,7 +314,7 @@ def llama_save_session_file( path_session: bytes, tokens, # type: Array[llama_token] n_token_count: c_size_t, -) -> c_size_t: +) -> int: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -337,7 +337,7 @@ def llama_eval( n_tokens: c_int, n_past: c_int, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) @@ -364,7 +364,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -def llama_n_vocab(ctx: llama_context_p) -> c_int: +def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,7 +372,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int -def llama_n_ctx(ctx: llama_context_p) -> c_int: +def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,7 +380,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int -def llama_n_embd(ctx: llama_context_p) -> c_int: +def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -426,7 +426,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -def llama_token_bos() -> llama_token: +def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,7 +434,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -def llama_token_eos() -> llama_token: +def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,7 +442,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -def llama_token_nl() -> llama_token: +def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -625,7 +625,7 @@ def llama_sample_token_mirostat( eta: c_float, m: c_int, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -651,7 +651,7 @@ def llama_sample_token_mirostat_v2( tau: c_float, eta: c_float, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, 
tau, eta, mu) @@ -669,7 +669,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -684,7 +684,7 @@ _lib.llama_sample_token_greedy.restype = llama_token def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token(ctx, candidates) diff --git a/tests/test_llama.py b/tests/test_llama.py index b3426b8..941287d 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -17,7 +17,7 @@ def test_llama(): # @pytest.mark.skip(reason="need to update sample mocking") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) ## Set up mock function def mock_eval(*args, **kwargs): @@ -107,7 +107,7 @@ def test_llama_pickle(): def test_utf8(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) ## Set up mock function def mock_eval(*args, **kwargs): From e783f1c191ff66b45fba4ed7bd2821703952ca62 Mon Sep 17 00:00:00 2001 From: Simon Chabot Date: Sat, 20 May 2023 01:23:32 +0200 Subject: [PATCH 256/443] feat: make embedding support list of string as input makes the /v1/embedding route similar to OpenAI api. --- llama_cpp/llama.py | 46 ++++++++++++++++++++++++++--------------- llama_cpp/server/app.py | 2 +- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 564c6c3..e854674 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -531,7 +531,9 @@ class Llama: if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str, model: Optional[str] = None) -> Embedding: + def create_embedding( + self, input: Union[str, List[str]], model: Optional[str] = None + ) -> Embedding: """Embed a string. 
Args: @@ -551,30 +553,40 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - tokens = self.tokenize(input.encode("utf-8")) - self.reset() - self.eval(tokens) - n_tokens = len(tokens) - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + if isinstance(input, str): + inputs = [input] + else: + inputs = input - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + data = [] + total_tokens = 0 + for input in inputs: + tokens = self.tokenize(input.encode("utf-8")) + self.reset() + self.eval(tokens) + n_tokens = len(tokens) + total_tokens += n_tokens + embedding = llama_cpp.llama_get_embeddings(self.ctx)[ + : llama_cpp.llama_n_embd(self.ctx) + ] - return { - "object": "list", - "data": [ + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + data.append( { "object": "embedding", "embedding": embedding, "index": 0, } - ], - "model": model_name, + ) + + return { + "object": "list", + "data": data, + "model": self.model_path, "usage": { - "prompt_tokens": n_tokens, - "total_tokens": n_tokens, + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, }, } diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 1ff0d1e..fea3612 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -275,7 +275,7 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field - input: str = Field(description="The input to embed.") + input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] class Config: From 0b079a658c5772f0c0004245f9d6564b5ab4b4af Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 20 May 2023 02:25:59 +0100 Subject: [PATCH 257/443] make git module accessible anonymously --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 6267b09..7edf097 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = git@github.com:ggerganov/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git From a7ba85834ffef7de128d5fda5a1d29f8f001b049 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 20 May 2023 08:13:41 -0400 Subject: [PATCH 258/443] Add n_ctx, n_vocab, and n_embd properties --- llama_cpp/llama.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 564c6c3..e399028 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1291,6 +1291,24 @@ class Llama: if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: raise RuntimeError("Failed to set llama state data") + @property + def n_ctx(self) -> int: + """Return the context window size.""" + assert self.ctx is not None + return llama_cpp.llama_n_ctx(self.ctx) + + @property + def n_embd(self) -> int: + """Return the embedding size.""" + assert self.ctx is not None + return llama_cpp.llama_n_embd(self.ctx) + + @property + def n_vocab(self) -> int: + """Return the vocabulary size.""" + assert self.ctx is not None + return llama_cpp.llama_n_vocab(self.ctx) + @staticmethod def token_eos() -> int: """Return the end-of-sequence token.""" From 76b1d2cd20d68795a3bd9768591ad8c1f249ccff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 20 May 2023 08:24:06 -0400 Subject: [PATCH 259/443] Change properties to functions to match token functions --- llama_cpp/llama.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 
e399028..6ccb823 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1291,19 +1291,16 @@ class Llama: if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: raise RuntimeError("Failed to set llama state data") - @property def n_ctx(self) -> int: """Return the context window size.""" assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) - @property def n_embd(self) -> int: """Return the embedding size.""" assert self.ctx is not None return llama_cpp.llama_n_embd(self.ctx) - @property def n_vocab(self) -> int: """Return the vocabulary size.""" assert self.ctx is not None From 8f49ca0287cab51a867bb636dc5638e003b80bfd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 20 May 2023 08:53:40 -0400 Subject: [PATCH 260/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7a52669..3a09bc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.51" +version = "0.1.52" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index ef6012e..bcd01bf 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.51", + version="0.1.52", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From fafe47114c22a9a1b316a81555e43a79bea2ede9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 17:47:21 -0400 Subject: [PATCH 261/443] Update llama.cpp --- llama_cpp/llama.py | 9 +- llama_cpp/llama_cpp.py | 219 +++++++++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 3 files changed, 186 insertions(+), 44 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6ccb823..332a882 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,6 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.n_parts = n_parts self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv @@ -149,6 +148,10 @@ class Llama: self.lora_base = lora_base self.lora_path = lora_path + ### DEPRECATED ### + self.n_parts = n_parts + ### DEPRECATED ### + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") @@ -1225,7 +1228,6 @@ class Llama: verbose=self.verbose, model_path=self.model_path, n_ctx=self.params.n_ctx, - n_parts=self.params.n_parts, n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, @@ -1239,6 +1241,9 @@ class Llama: n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + ### DEPRECATED ### + n_parts=self.n_parts, + ### DEPRECATED ### ) def __setstate__(self, state): diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0dcb16c..541ee00 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -72,31 +72,61 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types -LLAMA_FILE_VERSION = c_int(2) -LLAMA_FILE_MAGIC = b"ggjt" -LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" -LLAMA_SESSION_MAGIC = b"ggsn" +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' 
+LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # token id - ("logit", c_float), # log-odds of the token - ("p", c_float), # probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; class llama_token_data_array(Structure): _fields_ = [ ("data", llama_token_data_p), @@ -107,54 +137,72 @@ class llama_token_data_array(Structure): llama_token_data_array_p = POINTER(llama_token_data_array) +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("n_gpu_layers", c_int), # number of layers to store in VRAM - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), ( "logits_all", c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), ] llama_context_params_p = 
POINTER(llama_context_params) +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( - 4 -) # tok_embeddings.weight and output.weight are F16 -# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# Functions +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +# LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -163,6 +211,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -171,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool +# LLAMA_API bool llama_mlock_supported(); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -179,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. 
+# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -193,6 +267,7 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): return _lib.llama_free(ctx) @@ -204,6 +279,11 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> int: @@ -220,6 +300,11 @@ _lib.llama_model_quantize.restype = c_int # The model needs to be reloaded before applying a new adapter, otherwise the adapter # will be applied on top of the previous one # Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); def llama_apply_lora_from_file( ctx: llama_context_p, path_lora: c_char_p, @@ -234,6 +319,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -243,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. +# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) @@ -253,6 +340,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -264,6 +352,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); def llama_copy_state_data( ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: @@ -276,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] ) -> int: @@ -287,6 +377,7 @@ _lib.llama_set_state_data.restype = c_size_t # Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); def llama_load_session_file( ctx: llama_context_p, path_session: bytes, @@ -309,6 +400,7 @@ _lib.llama_load_session_file.argtypes = [ _lib.llama_load_session_file.restype = c_size_t +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); def llama_save_session_file( ctx: llama_context_p, path_session: bytes, @@ -331,6 +423,12 @@ _lib.llama_save_session_file.restype = c_size_t # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] @@ -350,6 +448,12 @@ _lib.llama_eval.restype = c_int # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -364,6 +468,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,6 +477,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,6 +486,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -393,6 +500,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); def llama_get_logits( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -405,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); def llama_get_embeddings( ctx: llama_context_p, ): # type: (...) 
-> Array[float] # type: ignore @@ -416,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -426,6 +536,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens +# LLAMA_API llama_token llama_token_bos(); def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,6 +545,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token +# LLAMA_API llama_token llama_token_eos(); def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,6 +554,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token +# LLAMA_API llama_token llama_token_nl(); def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -454,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -477,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -507,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] ): @@ -521,6 +637,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -540,6 +657,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -559,6 +677,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
+# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -578,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -596,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); def llama_sample_temperature( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -618,6 +739,7 @@ _lib.llama_sample_temperature.restype = None # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -645,6 +767,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -666,6 +789,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
+# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -681,6 +805,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -698,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -706,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -715,9 +842,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c238b58..7e4ea5b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c238b5873a1ea496db03ffcfe124c9d0d83afbc6 +Subproject commit 7e4ea5beff567f53be92f75f9089e6f11fa5dabd From 03e2947b03beb3954e128783b401ced6143c275d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 18:36:34 -0400 Subject: [PATCH 262/443] Fix unnecessary memory allocation while sampling --- llama_cpp/llama.py | 54 ++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 332a882..7943084 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -176,6 +176,28 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + + + n_vocab = self.n_vocab() + n_ctx = self.n_ctx() + data = (llama_cpp.llama_token_data * n_vocab)( + *[ + llama_cpp.llama_token_data( + id=llama_cpp.llama_token(i), + logit=llama_cpp.c_float(0.0), + p=llama_cpp.c_float(0.0), + ) + for i in range(n_vocab) + ] + ) + size = llama_cpp.c_size_t(n_vocab) + sorted = False + candidates = llama_cpp.llama_token_data_array( + data=data, + size=size, + sorted=sorted, + ) + self._candidates = candidates def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. 
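# A minimal sketch of the reuse pattern introduced in the hunk above, assuming only the
# ctypes structures already bound in llama_cpp.py and a hypothetical vocabulary size:
# the (llama_token_data * n_vocab) array is allocated once up front and then refreshed
# in place on every sampling step, instead of being rebuilt on each call.
import llama_cpp

n_vocab = 32000  # hypothetical size; the wrapper reads the real value via Llama.n_vocab()
candidates = llama_cpp.llama_token_data_array(
    data=(llama_cpp.llama_token_data * n_vocab)(),  # allocated once, reused afterwards
    size=llama_cpp.c_size_t(n_vocab),
    sorted=False,
)

def refresh_candidates(logits):  # hypothetical helper; logits holds n_vocab floats
    for i, logit in enumerate(logits):
        candidates.data[i].id = llama_cpp.llama_token(i)
        candidates.data[i].logit = llama_cpp.c_float(logit)
        candidates.data[i].p = llama_cpp.c_float(0.0)
    candidates.sorted = llama_cpp.c_bool(False)
    candidates.size = llama_cpp.c_size_t(n_vocab)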
@@ -296,8 +318,8 @@ class Llama: ): assert self.ctx is not None assert len(self.eval_logits) > 0 - n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) - n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + n_vocab = self.n_vocab() + n_ctx = self.n_ctx() top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = ( llama_cpp.c_int(n_ctx) @@ -305,24 +327,14 @@ class Llama: else last_n_tokens_size ) logits = self.eval_logits[-1] - nl_logit = logits[int(Llama.token_nl())] - data = (llama_cpp.llama_token_data * n_vocab)( - *[ - llama_cpp.llama_token_data( - id=llama_cpp.llama_token(i), - logit=logits[i], - p=llama_cpp.c_float(0.0), - ) - for i in range(n_vocab) - ] - ) - size = llama_cpp.c_size_t(n_vocab) - sorted = False - candidates = llama_cpp.llama_token_data_array( - data=data, - size=size, - sorted=sorted, - ) + nl_logit = logits[Llama.token_nl()] + candidates = self._candidates + for i, logit in enumerate(logits): + candidates.data[i].id = llama_cpp.llama_token(i) + candidates.data[i].logit = llama_cpp.c_float(logit) + candidates.data[i].p = llama_cpp.c_float(0.0) + candidates.sorted = llama_cpp.c_bool(False) + candidates.size = llama_cpp.c_size_t(n_vocab) llama_cpp.llama_sample_repetition_penalty( ctx=self.ctx, last_tokens_data=last_n_tokens_data, @@ -339,7 +351,7 @@ class Llama: alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[int(Llama.token_nl())].logit = nl_logit + candidates.data[Llama.token_nl()].logit = nl_logit if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, From b895511ccae4f166c893d0a69c40643943640772 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 18:38:06 -0400 Subject: [PATCH 263/443] Fix penalize_nl --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7943084..2d405b7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -351,7 +351,7 @@ class Llama: alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[Llama.token_nl()].logit = nl_logit + candidates.data[Llama.token_nl()].logit = llama_cpp.c_float(nl_logit) if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, From cd102e9da1a0e6159e5489f2cab23c207f4916a5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 19:18:56 -0400 Subject: [PATCH 264/443] Cache shared library function calls for static tokens --- llama_cpp/llama.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2d405b7..7a152fd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -198,6 +198,8 @@ class Llama: sorted=sorted, ) self._candidates = candidates + self._token_nl = Llama.token_nl() + self._token_eos = Llama.token_eos() def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. 
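# A minimal sketch of the caching idea in the hunk above, assuming only the module-level
# special-token helpers bound in llama_cpp.py: the ids are constant for a loaded
# vocabulary, so they are fetched through ctypes once and compared as plain Python ints.
import llama_cpp

class TokenCache:  # hypothetical wrapper, not part of the patch itself
    def __init__(self) -> None:
        self.token_nl = llama_cpp.llama_token_nl()    # one FFI call at construction
        self.token_eos = llama_cpp.llama_token_eos()  # reused for every sampled token

    def is_eos(self, token: int) -> bool:
        return token == self.token_eos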
@@ -327,7 +329,7 @@ class Llama: else last_n_tokens_size ) logits = self.eval_logits[-1] - nl_logit = logits[Llama.token_nl()] + nl_logit = logits[self._token_nl] candidates = self._candidates for i, logit in enumerate(logits): candidates.data[i].id = llama_cpp.llama_token(i) @@ -351,7 +353,7 @@ class Llama: alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[Llama.token_nl()].logit = llama_cpp.c_float(nl_logit) + candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -688,7 +690,7 @@ class Llama: presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): - if token == Llama.token_eos(): + if token == self._token_eos: text = self.detokenize(completion_tokens) finish_reason = "stop" break From 2c45255a0af4c2d1d58549cfa03fabf0b7bbe1bd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 19:24:20 -0400 Subject: [PATCH 265/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3a09bc0..d79793a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.52" +version = "0.1.53" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bcd01bf..ed89902 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.52", + version="0.1.53", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 0adb9ec37a3bc92e3a817b22a02a4bba30c4d82e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 21:30:03 -0400 Subject: [PATCH 266/443] Use model_name and index in response --- llama_cpp/llama.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4344418..916fe07 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -176,7 +176,6 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - n_vocab = self.n_vocab() n_ctx = self.n_ctx() @@ -575,9 +574,9 @@ class Llama: else: inputs = input - data = [] + data: List[EmbeddingData] = [] total_tokens = 0 - for input in inputs: + for index, input in enumerate(inputs): tokens = self.tokenize(input.encode("utf-8")) self.reset() self.eval(tokens) @@ -587,20 +586,20 @@ class Llama: : llama_cpp.llama_n_embd(self.ctx) ] - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) data.append( { "object": "embedding", "embedding": embedding, - "index": 0, + "index": index, } ) + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) return { "object": "list", "data": data, - "model": self.model_path, + "model": model_name, "usage": { "prompt_tokens": total_tokens, "total_tokens": total_tokens, From e6639e6620d7f6f8dcabef41b14f34870a3460ca Mon Sep 17 00:00:00 2001 From: Marcel Coetzee Date: Mon, 22 May 2023 10:10:14 +0200 Subject: [PATCH 267/443] Change docker build dynamic param to image instead of cuda version Signed-off-by: Marcel Coetzee --- Dockerfile.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index e95fa23..dda7a9f 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -1,5 +1,5 @@ -ARG CUDA_VERSION=12.1.1 -FROM 
nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" +FROM ${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 From 8e41d724aba87e5e77dae0ca9c974a3d1748d513 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 21:05:39 +0000 Subject: [PATCH 268/443] Bump httpx from 0.24.0 to 0.24.1 Bumps [httpx](https://github.com/encode/httpx) from 0.24.0 to 0.24.1. - [Release notes](https://github.com/encode/httpx/releases) - [Changelog](https://github.com/encode/httpx/blob/master/CHANGELOG.md) - [Commits](https://github.com/encode/httpx/compare/0.24.0...0.24.1) --- updated-dependencies: - dependency-name: httpx dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5289b29..9753cb2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -436,14 +436,14 @@ socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "httpx" -version = "0.24.0" +version = "0.24.1" description = "The next generation HTTP client." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"}, - {file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"}, + {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, + {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "d188fc14200f7ee348bef821265d676d584762983bcaf10f90c14221b4ed26a9" +content-hash = "d61c12cbbccd649f88ef156eb8a12080731c101c1dfc713396383764552e5872" diff --git a/pyproject.toml b/pyproject.toml index d79793a..66fda5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.12" pytest = "^7.3.1" -httpx = "^0.24.0" +httpx = "^0.24.1" [build-system] requires = [ From 2240b949ae0df0832a2c860710a5e379a2dda2cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 21:18:57 +0000 Subject: [PATCH 269/443] Bump mkdocs-material from 9.1.12 to 9.1.14 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.12 to 9.1.14. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.12...9.1.14) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9753cb2..e2a7a89 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.12" +version = "9.1.14" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.12-py3-none-any.whl", hash = "sha256:68c57d95d10104179c8c3ce9a88ee9d2322a5145b3d0f1f38ff686253fb5ec98"}, - {file = "mkdocs_material-9.1.12.tar.gz", hash = "sha256:d4ebe9b5031ce63a265c19fb5eab4d27ea4edadb05de206372e831b2b7570fb5"}, + {file = "mkdocs_material-9.1.14-py3-none-any.whl", hash = "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"}, + {file = "mkdocs_material-9.1.14.tar.gz", hash = "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "d61c12cbbccd649f88ef156eb8a12080731c101c1dfc713396383764552e5872" +content-hash = "e1749fc2c4926a4d78d2e0578e2c8075bfc7ccf905357ec1463b522e8536ab1a" diff --git a/pyproject.toml b/pyproject.toml index 66fda5b..ffdfc91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.12" +mkdocs-material = "^9.1.14" pytest = "^7.3.1" httpx = "^0.24.1" From c41b1ebca7a52d1eb6791ae5958457bcea8d67bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 22 May 2023 23:50:35 -0400 Subject: [PATCH 270/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7e4ea5b..2e6cd4b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7e4ea5beff567f53be92f75f9089e6f11fa5dabd +Subproject commit 2e6cd4b02549e343bef3768e6b946f999c82e823 From e5d596e0e922efe45b5a152d934431520116300a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 22 May 2023 23:50:58 -0400 Subject: [PATCH 271/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ffdfc91..9633ffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.53" +version = "0.1.54" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index ed89902..bd7192f 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.53", + version="0.1.54", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 327eedbfe16d950fc32d6e7a5573fd7f81cbbc74 Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Mon, 22 May 2023 23:56:25 -0700 Subject: [PATCH 272/443] fix "from_bytes() missing required argument 'byteorder'" --- examples/low_level_api/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py 
index 8773cb1..0e7cbe9 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -493,7 +493,7 @@ n_keep = {self.params.n_keep} # Contains multi-byte UTF8 for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if pattern & int.from_bytes(cur_char) == pattern: + if pattern & int.from_bytes(cur_char, 'little') == pattern: self.multibyte_fix = [cur_char] + ([None] * (num-1)) # Stop incomplete bytes from passing From d6a7adb17aa84b0be9a48e59a4d18295166bbfef Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Mon, 22 May 2023 23:54:57 -0700 Subject: [PATCH 273/443] fix "missing 1 required positional argument: 'min_keep'" --- examples/low_level_api/low_level_api_chat_cpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1..a0736fe 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -368,10 +368,10 @@ n_keep = {self.params.n_keep} id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1)) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) From eaff7a8678f655a41d978ba66cc0c805c8430684 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Tue, 23 May 2023 19:26:40 +0000 Subject: [PATCH 274/443] Initial commit of auto docker --- docker/Dockerfile | 51 ++++++++++++++++++ docker/README.md | 33 ++++++++++++ docker/hug_model.py | 119 +++++++++++++++++++++++++++++++++++++++++ docker/start_server.sh | 11 ++++ 4 files changed, 214 insertions(+) create mode 100644 docker/Dockerfile create mode 100644 docker/README.md create mode 100644 docker/hug_model.py create mode 100755 docker/start_server.sh diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..f0ef5f7 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,51 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn 
sse-starlette + +# Perform the conditional installations based on the image +RUN echo "Image: ${IMAGE}" && \ + if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ + echo "OpenBLAS install:" && \ + apt-get install -y --no-install-recommends libopenblas-dev && \ + LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ +else \ + echo "CuBLAS install:" && \ + LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ +fi + +# Clean up apt cache +RUN rm -rf /var/lib/apt/lists/* + +# Set a working directory for better clarity +WORKDIR /app + +# Copy files to the app directory +RUN echo "Installing model...this can take some time..." +COPY ./model.bin /app/model.bin +COPY ./start_server.sh /app/start_server.sh + +# Make the server start script executable +RUN chmod +x /app/start_server.sh + +# Set environment variable for the host +ENV HOST=0.0.0.0 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..445f264 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,33 @@ +# Get model from Hugging Face +`python3 ./hug_model.py` + +You should now have a model in the current directory and model.bin symlinked to it for the subsequent Docker build and copy step. e.g. +``` +docker $ ls -lh *.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 llama-7b.ggmlv3.q5_1.bin +lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin +``` +- Note #1: Make sure you have enough disk space to d/l the model. As the model is then copied into the image you will need at least +**TWICE** as much disk space as the size of the model: + +| Model | Quantized size | +|------:|----------------:| +| 7B | 5 GB | +| 13B | 10 GB | +| 30B | 25 GB | +| 65B | 50 GB | + +- Note #2: If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` + +# Use OpenBLAS (No NVidia GPU, defaults to `python:3-slim-bullseye` Docker base image) +## Build: +`docker build --build-arg -t openblas .` +## Run: +`docker run --cap-add SYS_RESOURCE -t openblas` + +# Use CuBLAS +Requires NVidia GPU and Docker NVidia support (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +## Build: +`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t opencuda .` +## Run: +`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/hug_model.py b/docker/hug_model.py new file mode 100644 index 0000000..476f53c --- /dev/null +++ b/docker/hug_model.py @@ -0,0 +1,119 @@ +import requests +import json +import os +import struct + +def make_request(url, params=None): + print(f"Making request to {url}...") + response = requests.get(url, params=params) + if response.status_code == 200: + return json.loads(response.text) + else: + print(f"Request failed with status code {response.status_code}") + return None + +def check_magic_and_version(filename): + with open(filename, 'rb') as f: + # Read the first 6 bytes from the file + data = f.read(6) + + # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int + # and the next 2 bytes as a little-endian unsigned short + magic, version = struct.unpack('= 10485760: # 10 MB + print('.', end='', flush=True) + total_downloaded = 0 + print("\nDownload complete.") + + # Creating a symbolic link from destination to "model.bin" + if os.path.isfile("model.bin"): + os.remove("model.bin") # remove the existing link if any + os.symlink(destination, "model.bin") 
+ else: + print(f"Download failed with status code {response.status_code}") + +def get_user_choice(model_list): + # Print the enumerated list + print("\n") + for i, (model_id, rfilename) in enumerate(model_list): + print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") + + # Get user's choice + choice = input("Choose a model to download by entering the corresponding number: ") + try: + index = int(choice) - 1 + if 0 <= index < len(model_list): + # Return the chosen model + return model_list[index] + else: + print("Invalid choice.") + except ValueError: + print("Invalid input. Please enter a number corresponding to a model.") + except IndexError: + print("Invalid choice. Index out of range.") + + return None + +import argparse + +def main(): + # Create an argument parser + parser = argparse.ArgumentParser(description='Process the model version.') + parser.add_argument('-v', '--version', type=int, default=0x0003, + help='an integer for the version to be used') + + # Parse the arguments + args = parser.parse_args() + + # Define the parameters + params = { + "author": "TheBloke", # Filter by author + "tags": "llama" + } + + models = make_request('https://huggingface.co/api/models', params=params) + if models is None: + return + + model_list = [] + # Iterate over the models + for model in models: + model_id = model['id'] + model_info = make_request(f'https://huggingface.co/api/models/{model_id}') + if model_info is None: + continue + + for sibling in model_info.get('siblings', []): + rfilename = sibling.get('rfilename') + if rfilename and 'q5_1' in rfilename: + model_list.append((model_id, rfilename)) + + model_choice = get_user_choice(model_list) + if model_choice is not None: + model_id, rfilename = model_choice + url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" + download_file(url, rfilename) + _, version = check_magic_and_version(rfilename) + if version != args.version: + print(f"Warning: Expected version {args.version}, but found different version in the file.") + +if __name__ == '__main__': + main() diff --git a/docker/start_server.sh b/docker/start_server.sh new file mode 100755 index 0000000..176bd87 --- /dev/null +++ b/docker/start_server.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +# For mmap support +ulimit -l unlimited + +if [ "$IMAGE" = "python:3-slim-bullseye" ]; then + python3 -B -m llama_cpp.server --model /app/model.bin +else + # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM + python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 +fi From 70f629a72fe1dae576988a8107f683c66c887d7f Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Tue, 23 May 2023 20:36:21 +0100 Subject: [PATCH 275/443] Update README.md --- docker/README.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docker/README.md b/docker/README.md index 445f264..3a538af 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,13 +1,13 @@ # Get model from Hugging Face `python3 ./hug_model.py` -You should now have a model in the current directory and model.bin symlinked to it for the subsequent Docker build and copy step. e.g. +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. 
``` docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 llama-7b.ggmlv3.q5_1.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 .q5_1.bin lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin ``` -- Note #1: Make sure you have enough disk space to d/l the model. As the model is then copied into the image you will need at least +**Note #1:** Make sure you have enough disk space to d/l the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model: | Model | Quantized size | @@ -17,16 +17,23 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5 | 30B | 25 GB | | 65B | 50 GB | -- Note #2: If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` +**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` -# Use OpenBLAS (No NVidia GPU, defaults to `python:3-slim-bullseye` Docker base image) +# Install Docker Server + +**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this README with a PR! + +[Install Docker Engine](https://docs.docker.com/engine/install) + +# Use OpenBLAS +No NVidia GPU, defaults to `python:3-slim-bullseye` Docker base image and OpenBlAS: ## Build: `docker build --build-arg -t openblas .` ## Run: `docker run --cap-add SYS_RESOURCE -t openblas` # Use CuBLAS -Requires NVidia GPU and Docker NVidia support (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +Requires NVidia GPU and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) ## Build: `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t opencuda .` ## Run: From ed19071ef8439d876bde415852cd53ba0a863ebd Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Tue, 23 May 2023 19:38:37 +0000 Subject: [PATCH 276/443] Renamed and moved old Dockerfiles --- Dockerfile.cuda => docker/Dockerfile.cuda_simple | 0 Dockerfile => docker/Dockerfile.openblas_simple | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename Dockerfile.cuda => docker/Dockerfile.cuda_simple (100%) rename Dockerfile => docker/Dockerfile.openblas_simple (100%) diff --git a/Dockerfile.cuda b/docker/Dockerfile.cuda_simple similarity index 100% rename from Dockerfile.cuda rename to docker/Dockerfile.cuda_simple diff --git a/Dockerfile b/docker/Dockerfile.openblas_simple similarity index 100% rename from Dockerfile rename to docker/Dockerfile.openblas_simple From ec44bdad614c68b3b3f904ff04ecc68ea158ff3e Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Tue, 23 May 2023 20:50:39 +0100 Subject: [PATCH 277/443] Update README.md --- docker/README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docker/README.md b/docker/README.md index 3a538af..100bcbd 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,3 +1,9 @@ +# Dockerfiles for building the llama-cpp-python server +- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS +- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS +- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) +- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously 
downloaded model `model.bin` + # Get model from Hugging Face `python3 ./hug_model.py` @@ -7,7 +13,7 @@ docker $ ls -lh *.bin -rw-rw-r-- 1 user user 4.8G May 23 18:30 .q5_1.bin lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin ``` -**Note #1:** Make sure you have enough disk space to d/l the model. As the model is then copied into the image you will need at least +**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model: | Model | Quantized size | @@ -21,20 +27,20 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5 # Install Docker Server -**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this README with a PR! +**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! [Install Docker Engine](https://docs.docker.com/engine/install) # Use OpenBLAS -No NVidia GPU, defaults to `python:3-slim-bullseye` Docker base image and OpenBlAS: +Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: ## Build: `docker build --build-arg -t openblas .` ## Run: `docker run --cap-add SYS_RESOURCE -t openblas` # Use CuBLAS -Requires NVidia GPU and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) +Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) ## Build: -`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t opencuda .` +`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` ## Run: `docker run --cap-add SYS_RESOURCE -t cublas` From fab064ded91209f7f1e3fe5ff5b247db891c446a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 23 May 2023 17:56:21 -0400 Subject: [PATCH 278/443] Remove unnecessary ffi calls --- llama_cpp/llama.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 916fe07..43fa9c7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -177,19 +177,19 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - n_vocab = self.n_vocab() - n_ctx = self.n_ctx() - data = (llama_cpp.llama_token_data * n_vocab)( + self._n_vocab = self.n_vocab() + self._n_ctx = self.n_ctx() + data = (llama_cpp.llama_token_data * self._n_vocab)( *[ llama_cpp.llama_token_data( id=llama_cpp.llama_token(i), logit=llama_cpp.c_float(0.0), p=llama_cpp.c_float(0.0), ) - for i in range(n_vocab) + for i in range(self._n_vocab) ] ) - size = llama_cpp.c_size_t(n_vocab) + size = llama_cpp.c_size_t(self._n_vocab) sorted = False candidates = llama_cpp.llama_token_data_array( data=data, @@ -213,8 +213,8 @@ class Llama: A list of tokens. 
""" assert self.ctx is not None - n_ctx = llama_cpp.llama_n_ctx(self.ctx) - tokens = (llama_cpp.llama_token * int(n_ctx))() + n_ctx = self._n_ctx + tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( self.ctx, text, @@ -222,9 +222,9 @@ class Llama: llama_cpp.c_int(n_ctx), llama_cpp.c_bool(add_bos), ) - if int(n_tokens) < 0: + if n_tokens < 0: n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * int(n_tokens))() + tokens = (llama_cpp.llama_token * n_tokens)() n_tokens = llama_cpp.llama_tokenize( self.ctx, text, @@ -275,7 +275,7 @@ class Llama: tokens: The list of tokens to evaluate. """ assert self.ctx is not None - n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self.eval_tokens)) @@ -287,18 +287,16 @@ class Llama: n_past=llama_cpp.c_int(n_past), n_threads=llama_cpp.c_int(self.n_threads), ) - if int(return_code) != 0: + if return_code != 0: raise RuntimeError(f"llama_eval returned {return_code}") # Save tokens self.eval_tokens.extend(batch) # Save logits rows = n_tokens if self.params.logits_all else 1 - n_vocab = llama_cpp.llama_n_vocab(self.ctx) - cols = int(n_vocab) + n_vocab = self._n_vocab + cols = n_vocab logits_view = llama_cpp.llama_get_logits(self.ctx) - logits: List[List[float]] = [ - [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) - ] + logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) def _sample( @@ -319,8 +317,8 @@ class Llama: ): assert self.ctx is not None assert len(self.eval_logits) > 0 - n_vocab = self.n_vocab() - n_ctx = self.n_ctx() + n_vocab = self._n_vocab + n_ctx = self._n_ctx top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = ( llama_cpp.c_int(n_ctx) @@ -654,9 +652,9 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): + if len(prompt_tokens) + max_tokens > self._n_ctx: raise ValueError( - f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens exceed context window of {self._n_ctx}" ) if stop != []: From 5bb780d455d4158870c231a7fde1fa16863361f1 Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Wed, 24 May 2023 21:55:44 +0200 Subject: [PATCH 279/443] Implemented logit processors and stop criteria's --- llama_cpp/llama.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 916fe07..cf1e719 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -316,6 +316,7 @@ class Llama: mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, penalize_nl: bool = True, + logits_processors=None ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -328,6 +329,10 @@ class Llama: else last_n_tokens_size ) logits = self.eval_logits[-1] + for processor in logits_processors: + logits = processor(list(self.eval_tokens), logits) + + self.eval_logits[-1] = logits nl_logit = logits[self._token_nl] candidates = self._candidates for i, logit in enumerate(logits): @@ -436,6 +441,8 @@ class Llama: mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, + logits_processors=None + ): """Sample a token from the model. 
@@ -468,6 +475,8 @@ class Llama: mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), penalize_nl=penalize_nl, + logits_processors=logits_processors + ) def generate( @@ -484,6 +493,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + logits_processors=None ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -541,6 +551,7 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + logits_processors=logits_processors ) tokens_or_none = yield token tokens = [token] @@ -637,6 +648,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + logits_processors=None, + stopping_criterias=None ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -700,6 +713,7 @@ class Llama: frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, + logits_processors=logits_processors ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -707,6 +721,14 @@ class Llama: break completion_tokens.append(token) + for stopping_crit in stopping_criterias: + if stopping_crit(completion_tokens, None): + text = self.detokenize(completion_tokens) + finish_reason = "stop" + break + + if finish_reason == "stop": + break all_text = self.detokenize(completion_tokens) @@ -1006,6 +1028,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + logits_processors=None, + stopping_criterias=None ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1048,6 +1072,9 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + logits_processors=logits_processors, + stopping_criterias=stopping_criterias + ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks From c05fcdf42f991d5c43dea3377dc1529adcd45167 Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Wed, 24 May 2023 22:02:06 +0200 Subject: [PATCH 280/443] Fixed none value of logits processors. 
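
As background for this fix, here is a minimal sketch of how the interim list-of-callables API from the preceding patch is meant to be driven. The helper names, the suppressed token id, the model path, and the prompt are illustrative rather than part of the library, and the calling conventions are inferred from the diffs (a processor receives the evaluated token ids plus the next-token logits and returns logits; a stopping criterion receives the completion tokens so far and returns True to end generation) rather than verified:

```python
# Sketch only, against the interim API of PATCH 279/280: create_completion()
# accepts plain lists of callables. Names and values below are illustrative.
from typing import List, Optional

from llama_cpp import Llama

llm = Llama(model_path="./ggml-model.bin")  # placeholder path


def suppress_token_zero(input_ids: List[int], logits: List[float]) -> List[float]:
    # A logits processor gets the tokens evaluated so far and the raw logits
    # for the next token, and returns the (possibly modified) logits.
    logits[0] = -float("inf")
    return logits


def stop_after_16(completion_tokens: List[int], _logits: Optional[List[float]]) -> bool:
    # A stopping criterion is called with the completion tokens generated so far
    # (the second argument is None in this version); returning True stops generation.
    return len(completion_tokens) >= 16


out = llm.create_completion(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=64,
    logits_processors=[suppress_token_zero],
    stopping_criterias=[stop_after_16],
)
print(out["choices"][0]["text"])
```
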
--- llama_cpp/llama.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cf1e719..c6f540c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -320,6 +320,10 @@ class Llama: ): assert self.ctx is not None assert len(self.eval_logits) > 0 + + if logits_processors == None: + logits_processors = [] + n_vocab = self.n_vocab() n_ctx = self.n_ctx() top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k @@ -652,6 +656,10 @@ class Llama: stopping_criterias=None ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None + + if stopping_criterias == None: + stopping_criterias = [] + completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) completion_tokens: List[int] = [] From da463e6c8c3c09c7a32bf25d924974d74f3d2776 Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Thu, 25 May 2023 09:07:16 +0200 Subject: [PATCH 281/443] Added types to logit processor list and stop criteria list --- llama_cpp/llama.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c6f540c..8176136 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,7 @@ import uuid import time import math import multiprocessing -from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple +from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple, Callable from collections import deque, OrderedDict from . import llama_cpp @@ -316,12 +316,11 @@ class Llama: mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, penalize_nl: bool = True, - logits_processors=None + logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None ): assert self.ctx is not None assert len(self.eval_logits) > 0 - - if logits_processors == None: + if logits_processors is None: logits_processors = [] n_vocab = self.n_vocab() @@ -445,7 +444,7 @@ class Llama: mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, - logits_processors=None + logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None ): """Sample a token from the model. @@ -497,7 +496,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - logits_processors=None + logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
@@ -652,12 +651,12 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors=None, - stopping_criterias=None + logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None, + stopping_criterias: List[Callable[[List[int], List[llama_cpp.c_float]], bool]] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None - if stopping_criterias == None: + if stopping_criterias is None: stopping_criterias = [] completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -1036,8 +1035,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors=None, - stopping_criterias=None + logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None, + stopping_criterias: List[Callable[[List[int], List[llama_cpp.c_float]], bool]] = None ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. From c2585b68894102ace0cb0c54dc812e27c36482b9 Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Thu, 25 May 2023 10:54:08 +0200 Subject: [PATCH 282/443] Fixed list elements typing --- llama_cpp/llama.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8176136..144671b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -316,7 +316,7 @@ class Llama: mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, penalize_nl: bool = True, - logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None + logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -444,7 +444,7 @@ class Llama: mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, - logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None + logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None ): """Sample a token from the model. @@ -496,7 +496,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None + logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
@@ -651,8 +651,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None, - stopping_criterias: List[Callable[[List[int], List[llama_cpp.c_float]], bool]] = None, + logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None, + stopping_criterias: List[Callable[[List[int], List[float]], bool]] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None @@ -1035,8 +1035,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors: List[Callable[[List[llama_cpp.c_int], List[llama_cpp.c_float]], List[float]]] = None, - stopping_criterias: List[Callable[[List[int], List[llama_cpp.c_float]], bool]] = None + logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None, + stopping_criterias: List[Callable[[List[int], List[float]], bool]] = None ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. From 0d2cc21202620e92fd152981c6b7ecc0190a6124 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Thu, 25 May 2023 11:50:02 +0000 Subject: [PATCH 283/443] Fixed repeated imports --- docker/hug_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker/hug_model.py b/docker/hug_model.py index 476f53c..848a1aa 100644 --- a/docker/hug_model.py +++ b/docker/hug_model.py @@ -25,9 +25,6 @@ def check_magic_and_version(filename): return magic, version -import os -import requests - def download_file(url, destination): print(f"Downloading {url} to {destination}...") response = requests.get(url, stream=True) From 1d247e0f350948667553f3c880f8df40f0b5c787 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 May 2023 14:04:54 -0400 Subject: [PATCH 284/443] Add StoppingCriteria and LogitsProcessor to generate to match huggingface API --- llama_cpp/llama.py | 74 ++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 144671b..b7a8d79 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,17 @@ import uuid import time import math import multiprocessing -from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple, Callable +from typing import ( + List, + Optional, + Union, + Generator, + Sequence, + Iterator, + Deque, + Tuple, + Callable, +) from collections import deque, OrderedDict from . 
import llama_cpp @@ -72,6 +82,24 @@ class LlamaState: self.llama_state_size = llama_state_size +LogitsProcessor = Callable[[List[int], List[float]], List[float]] + + +class LogitsProcessorList(List[LogitsProcessor]): + def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +StoppingCriteria = Callable[[List[int], List[float]], bool] + + +class StoppingCriteriaList(List[StoppingCriteria]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -316,12 +344,10 @@ class Llama: mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, penalize_nl: bool = True, - logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None + logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert len(self.eval_logits) > 0 - if logits_processors is None: - logits_processors = [] n_vocab = self.n_vocab() n_ctx = self.n_ctx() @@ -332,10 +358,10 @@ class Llama: else last_n_tokens_size ) logits = self.eval_logits[-1] - for processor in logits_processors: - logits = processor(list(self.eval_tokens), logits) - self.eval_logits[-1] = logits + if logits_processor is not None: + logits = logits_processor(list(self.eval_tokens), logits) + nl_logit = logits[self._token_nl] candidates = self._candidates for i, logit in enumerate(logits): @@ -444,8 +470,7 @@ class Llama: mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, - logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None - + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -478,8 +503,7 @@ class Llama: mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), penalize_nl=penalize_nl, - logits_processors=logits_processors - + logits_processor=logits_processor, ) def generate( @@ -496,7 +520,8 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
@@ -554,8 +579,12 @@ class Llama: mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, - logits_processors=logits_processors + logits_processor=logits_processor, ) + if stopping_criteria is not None and stopping_criteria( + list(self.eval_tokens), self.eval_logits[-1] + ): + return tokens_or_none = yield token tokens = [token] if tokens_or_none is not None: @@ -651,14 +680,9 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None, - stopping_criterias: List[Callable[[List[int], List[float]], bool]] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None - if stopping_criterias is None: - stopping_criterias = [] - completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) completion_tokens: List[int] = [] @@ -720,7 +744,6 @@ class Llama: frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, - logits_processors=logits_processors ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -728,14 +751,6 @@ class Llama: break completion_tokens.append(token) - for stopping_crit in stopping_criterias: - if stopping_crit(completion_tokens, None): - text = self.detokenize(completion_tokens) - finish_reason = "stop" - break - - if finish_reason == "stop": - break all_text = self.detokenize(completion_tokens) @@ -1035,8 +1050,6 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, - logits_processors: List[Callable[[List[int], List[float]], List[float]]] = None, - stopping_criterias: List[Callable[[List[int], List[float]], bool]] = None ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -1079,9 +1092,6 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, - logits_processors=logits_processors, - stopping_criterias=stopping_criterias - ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks From ca01f98e09f2f4146d8adb19efbd48460a99068c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 25 May 2023 14:11:33 -0400 Subject: [PATCH 285/443] Add LlamaTokenizer class --- llama_cpp/llama.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b7a8d79..7dd1acb 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1380,6 +1380,11 @@ class Llama: assert self.ctx is not None return llama_cpp.llama_n_vocab(self.ctx) + def tokenizer(self) -> "LlamaTokenizer": + """Return the tokenizer for this model.""" + assert self.ctx is not None + return LlamaTokenizer(self) + @staticmethod def token_eos() -> int: """Return the end-of-sequence token.""" @@ -1410,3 +1415,18 @@ class Llama: else: break return longest_prefix + + +class LlamaTokenizer: + def __init__(self, llama: Llama): + self.llama = llama + + def encode(self, text: str) -> List[int]: + return self.llama.tokenize(text.encode("utf-8", errors="ignore")) + + def decode(self, tokens: List[int]) -> str: + return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + return cls(Llama(model_path=path, vocab_only=True)) From 8fa2ef195970a1455b78389fa0840da3380b77b3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:00:35 -0400 Subject: [PATCH 286/443] Format --- llama_cpp/llama.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b43dfe7..0978e1e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -696,9 +696,7 @@ class Llama: llama_cpp.llama_reset_timings(self.ctx) if len(prompt_tokens) + max_tokens > self._n_ctx: - raise ValueError( - f"Requested tokens exceed context window of {self._n_ctx}" - ) + raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] From 5be8354e11e5b5cf99963eefc2c13541d60c0634 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:00:51 -0400 Subject: [PATCH 287/443] Added tokenizer --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0978e1e..82246d1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1416,8 +1416,10 @@ class LlamaTokenizer: def __init__(self, llama: Llama): self.llama = llama - def encode(self, text: str) -> List[int]: - return self.llama.tokenize(text.encode("utf-8", errors="ignore")) + def encode(self, text: str, add_bos: bool = True) -> List[int]: + return self.llama.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos + ) def decode(self, tokens: List[int]) -> str: return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") From f74b90ed6767957ac0eb1b5364196a22e10166de Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:03:01 -0400 Subject: [PATCH 288/443] Fix streaming hang on last token when cache is on. 
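
Before the streaming fix below, a short usage sketch of the helpers introduced in the patches above (`LlamaTokenizer`, `LogitsProcessorList`, `StoppingCriteriaList`). The model path and the lambda bodies are placeholders, and the calls are assumed from the diffs rather than tested; at this point in the series the processor/criteria hooks are threaded through `generate`, so that is what the sketch uses:

```python
# Sketch only: exercises the helpers added in PATCHES 284-287.
# "./ggml-model.bin" is a placeholder; the lambdas are illustrative no-ops.
from llama_cpp import Llama
from llama_cpp.llama import LogitsProcessorList, StoppingCriteriaList

llm = Llama(model_path="./ggml-model.bin")

# LlamaTokenizer wraps tokenize()/detokenize() behind a str-based encode/decode.
tokenizer = llm.tokenizer()
ids = tokenizer.encode("Hello, world")  # add_bos=True by default
round_trip = tokenizer.decode(ids)

# The list wrappers chain logits processors and OR together stopping criteria.
processors = LogitsProcessorList([lambda input_ids, scores: scores])
criteria = StoppingCriteriaList([lambda input_ids, logits: len(input_ids) > 256])

prompt_tokens = llm.tokenize(b"Q: What is the capital of France? A:")
for token in llm.generate(
    prompt_tokens,
    top_k=40,
    top_p=0.95,
    temp=0.8,
    repeat_penalty=1.1,
    logits_processor=processors,
    stopping_criteria=criteria,
):
    if token == Llama.token_eos():
        break
    print(llm.detokenize([token]).decode("utf-8", errors="ignore"), end="", flush=True)
```
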
--- llama_cpp/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 82246d1..f4b2d49 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -848,11 +848,6 @@ class Llama: finish_reason = "length" break - if self.cache: - if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) - self.cache[prompt_tokens + completion_tokens] = self.save_state() - if self.verbose: llama_cpp.llama_print_timings(self.ctx) @@ -941,8 +936,17 @@ class Llama: } ], } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() return + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + text_str = text.decode("utf-8", errors="ignore") if echo: From 30bf8ec55776d6254ec4b2146ea5585db6010459 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:03:11 -0400 Subject: [PATCH 289/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2e6cd4b..66874d4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2e6cd4b02549e343bef3768e6b946f999c82e823 +Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 From 433a2e3e8a518bfb0eff21af23933014ce7a5b20 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 03:13:24 -0400 Subject: [PATCH 290/443] Add extra logits_processor and stopping_criteria --- llama_cpp/llama.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f4b2d49..f61b077 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -677,6 +677,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None @@ -739,6 +741,8 @@ class Llama: frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -848,6 +852,11 @@ class Llama: finish_reason = "length" break + if stopping_criteria is not None and stopping_criteria( + list(self.eval_tokens), self.eval_logits[-1] + ): + finish_reason = "stop" + if self.verbose: llama_cpp.llama_print_timings(self.ctx) @@ -1049,6 +1058,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -1091,6 +1102,8 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -1118,6 +1131,8 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1160,6 +1175,8 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) def _convert_text_completion_to_chat( From 0fa2ec490396447afa2fc7da3dc5033759c888a6 Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Fri, 26 May 2023 06:35:15 -0700 Subject: [PATCH 291/443] low_level_api_chat_cpp.py: Fix missing antiprompt output in chat. --- examples/low_level_api/low_level_api_chat_cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1..756609e 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -382,12 +382,15 @@ n_keep = {self.params.n_keep} # replace end of text token with newline token when in interactive mode if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] + self.embd.append(id) if (self.use_antiprompt()): # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] - - # add it to the context - self.embd.append(id) + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) # echo this to console self.output_echo = True From 4c1b7f7a76df3459c958a9da640b84fac2110c86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 10:25:28 -0400 Subject: [PATCH 292/443] Bugfix for logits_processor and stopping_criteria --- llama_cpp/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f61b077..012bb86 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -358,6 +358,7 @@ class Llama: if logits_processor is not None: logits = logits_processor(list(self.eval_tokens), logits) + self.eval_logits[-1] = logits nl_logit = logits[self._token_nl] candidates = self._candidates @@ -855,6 +856,7 @@ class Llama: if stopping_criteria is not None and stopping_criteria( list(self.eval_tokens), self.eval_logits[-1] ): + text = self.detokenize(completion_tokens) finish_reason = "stop" if self.verbose: From 8eb9769f78465ae0926d5f7d28cc368b877be96d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 16:12:45 -0400 Subject: [PATCH 293/443] Add support for numpy --- llama_cpp/llama.py | 57 ++++++++++++++++++++++++++++++---------------- setup.py | 4 +--- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 012bb86..6babebd 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -20,6 +20,9 @@ from collections import deque, OrderedDict from . 
import llama_cpp from .llama_types import * +import numpy as np +import numpy.typing as npt + class LlamaCache: """Cache for a llama.cpp model.""" @@ -73,11 +76,15 @@ class LlamaState: self, eval_tokens: Deque[int], eval_logits: Deque[List[float]], + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits + self.input_ids = input_ids + self.scores = scores self.llama_state = llama_state self.llama_state_size = llama_state_size @@ -207,20 +214,14 @@ class Llama: self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - data = (llama_cpp.llama_token_data * self._n_vocab)( - *[ - llama_cpp.llama_token_data( - id=llama_cpp.llama_token(i), - logit=llama_cpp.c_float(0.0), - p=llama_cpp.c_float(0.0), - ) - for i in range(self._n_vocab) - ] - ) size = llama_cpp.c_size_t(self._n_vocab) - sorted = False + sorted = llama_cpp.c_bool(False) + self._candidates_data = np.array( + [], dtype=[("id", np.intc), ("logit", np.single), ("p", np.single)] + ) + self._candidates_data.resize(3, self._n_vocab) candidates = llama_cpp.llama_token_data_array( - data=data, + data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), size=size, sorted=sorted, ) @@ -228,6 +229,9 @@ class Llama: self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() + self._input_ids = np.array([], dtype=np.intc) + self._scores = np.ndarray((0, self._n_vocab), dtype=np.single) + def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. @@ -319,6 +323,9 @@ class Llama: raise RuntimeError(f"llama_eval returned {return_code}") # Save tokens self.eval_tokens.extend(batch) + self._input_ids: npt.NDArray[np.intc] = np.concatenate( + (self._input_ids, np.array(batch, dtype=np.intc)), axis=0 + ) # Save logits rows = n_tokens if self.params.logits_all else 1 n_vocab = self._n_vocab @@ -326,6 +333,9 @@ class Llama: logits_view = llama_cpp.llama_get_logits(self.ctx) logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) + self._scores: npt.NDArray[np.single] = np.concatenate( + (self._scores, np.array(logits, dtype=np.single)), axis=0 + ) def _sample( self, @@ -354,18 +364,23 @@ class Llama: if last_n_tokens_size.value < 0 else last_n_tokens_size ) - logits = self.eval_logits[-1] + logits: npt.NDArray[np.single] = self._scores[-1, :] if logits_processor is not None: - logits = logits_processor(list(self.eval_tokens), logits) - self.eval_logits[-1] = logits + logits = np.array( + logits_processor(list(self.eval_tokens), logits.tolist()), + dtype=np.single, + ) + self._scores[-1, :] = logits + self.eval_logits[-1] = logits.tolist() nl_logit = logits[self._token_nl] candidates = self._candidates - for i, logit in enumerate(logits): - candidates.data[i].id = llama_cpp.llama_token(i) - candidates.data[i].logit = llama_cpp.c_float(logit) - candidates.data[i].p = llama_cpp.c_float(0.0) + candidates_data = self._candidates_data + candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["logit"] = logits + candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) llama_cpp.llama_sample_repetition_penalty( @@ -1371,6 +1386,8 @@ class Llama: return LlamaState( eval_tokens=self.eval_tokens.copy(), 
eval_logits=self.eval_logits.copy(), + scores=self._scores.copy(), + input_ids=self._input_ids.copy(), llama_state=llama_state_compact, llama_state_size=n_bytes, ) @@ -1379,6 +1396,8 @@ class Llama: assert self.ctx is not None self.eval_tokens = state.eval_tokens.copy() self.eval_logits = state.eval_logits.copy() + self._scores = state.scores.copy() + self._input_ids = state.input_ids.copy() state_size = state.llama_state_size if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: raise RuntimeError("Failed to set llama state data") diff --git a/setup.py b/setup.py index bd7192f..198dd74 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,7 @@ setup( license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, packages=["llama_cpp", "llama_cpp.server"], - install_requires=[ - "typing-extensions>=4.5.0", - ], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.24.2"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], }, From bd4b95da45aa129277cdba0ccdab10a1af99c2e5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 16:38:21 -0400 Subject: [PATCH 294/443] Reduce numpy version dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 198dd74..c51202e 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, packages=["llama_cpp", "llama_cpp.server"], - install_requires=["typing-extensions>=4.5.0", "numpy>=1.24.2"], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], }, From 6075e17cb6c74cc48369a65cc742dda607ebc43f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 17:21:51 -0400 Subject: [PATCH 295/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9633ffc..895a644 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.54" +version = "0.1.55" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bd7192f..2136d8d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.54", + version="0.1.55", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 030fafe901f5384b8e51a5bf082493c64e95e6a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 17:32:34 -0400 Subject: [PATCH 296/443] Add project changelog --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6eb04cd --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added + +- Added first version of the changelog \ No newline at end of file From 00ea3af51b7f93495a852288487b378455a024d5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 17:56:20 -0400 Subject: [PATCH 297/443] Add makefile --- Makefile | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c1fe552 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +update: + poetry install + git submodule update --init --recursive + +update.vendor: + cd vendor/llama.cpp && git pull origin master + +build: + python3 setup.py develop + +build.cuda: + CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.opencl: + CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop + +build.openblas: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.blis: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop + +build.sdist: + python3 setup.py sdist + +deploy.pypi: + python3 -m twine upload dist/* + +deploy.gh-docs: + mkdocs build + mkdocs gh-deploy + +clean: + - cd vendor/llama.cpp && make clean + - cd vendor/llama.cpp && rm libllama.so + - rm -rf _skbuild + - rm llama_cpp/libllama.so + +.PHONY: \ + update \ + update.vendor \ + build \ + build.cuda \ + build.opencl \ + build.openblas \ + build.sdist \ + deploy.pypi \ + deploy.gh-docs \ + clean \ No newline at end of file From fe331ec58914feaacfa3052957fef53bbd005997 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 20:03:31 -0400 Subject: [PATCH 298/443] Replace eval_logits and eval_tokens with numpy arrays --- llama_cpp/llama.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6babebd..4f10227 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -299,6 +299,8 @@ class Llama: """Reset the model state.""" self.eval_tokens.clear() self.eval_logits.clear() + self._input_ids = np.array([], dtype=np.intc) + self._scores = np.ndarray((0, self._n_vocab), dtype=np.single) def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. 
@@ -310,7 +312,7 @@ class Llama: n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), len(self.eval_tokens)) + n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) return_code = llama_cpp.llama_eval( ctx=self.ctx, @@ -356,6 +358,7 @@ class Llama: ): assert self.ctx is not None assert len(self.eval_logits) > 0 + assert self._scores.shape[0] > 0 n_vocab = self._n_vocab n_ctx = self._n_ctx top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k @@ -368,7 +371,7 @@ class Llama: if logits_processor is not None: logits = np.array( - logits_processor(list(self.eval_tokens), logits.tolist()), + logits_processor(self._input_ids.tolist(), logits.tolist()), dtype=np.single, ) self._scores[-1, :] = logits @@ -498,8 +501,8 @@ class Llama: """ assert self.ctx is not None last_n_tokens_data = [llama_cpp.llama_token(0)] * max( - 0, self.last_n_tokens_size - len(self.eval_tokens) - ) + list(self.eval_tokens)[-self.last_n_tokens_size :] + 0, self.last_n_tokens_size - len(self._input_ids) + ) + self._input_ids[-self.last_n_tokens_size :].tolist() return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data @@ -557,9 +560,9 @@ class Llama: """ assert self.ctx is not None - if reset and len(self.eval_tokens) > 0: + if reset and len(self._input_ids) > 0: longest_prefix = 0 - for a, b in zip(self.eval_tokens, tokens[:-1]): + for a, b in zip(self._input_ids, tokens[:-1]): if a == b: longest_prefix += 1 else: @@ -569,6 +572,8 @@ class Llama: print("Llama.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] + self._input_ids = self._input_ids[:longest_prefix] + self._scores = self._scores[:longest_prefix, :] for _ in range(len(self.eval_tokens) - longest_prefix): self.eval_tokens.pop() try: @@ -595,7 +600,7 @@ class Llama: logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - list(self.eval_tokens), self.eval_logits[-1] + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -820,7 +825,7 @@ class Llama: self.detokenize(completion_tokens[:returned_tokens]) ) token_offset = len(prompt_tokens) + returned_tokens - logits = self.eval_logits[token_offset - 1] + logits = self._scores[token_offset - 1, :].tolist() current_logprobs = Llama.logits_to_logprobs(logits) sorted_logprobs = list( sorted( @@ -869,7 +874,7 @@ class Llama: break if stopping_criteria is not None and stopping_criteria( - list(self.eval_tokens), self.eval_logits[-1] + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -899,7 +904,7 @@ class Llama: self.detokenize(completion_tokens[:returned_tokens]) ) token_offset = len(prompt_tokens) + returned_tokens - 1 - logits = self.eval_logits[token_offset] + logits = self._scores[token_offset, :].tolist() current_logprobs = Llama.logits_to_logprobs(logits) sorted_logprobs = list( sorted( @@ -1001,8 +1006,7 @@ class Llama: for token in all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(list(map(float, row))) - for row in self.eval_logits + Llama.logits_to_logprobs(row.tolist()) for row in self._scores ][token_offset:] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs From 7fc7bc30e712c10d633a7acf912134ae92c0fbe3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 20:12:05 -0400 
Subject: [PATCH 299/443] Remove usage of eval_tokens for cache check --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4f10227..064b982 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -735,10 +735,10 @@ class Llama: try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( - cache_item.eval_tokens, prompt_tokens + cache_item.input_ids.tolist(), prompt_tokens ) eval_prefix_len = Llama.longest_token_prefix( - self.eval_tokens, prompt_tokens + self._input_ids.tolist(), prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) From 8f35bddd7eab82b2609c61d76e74a4509b57c26b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 20:23:49 -0400 Subject: [PATCH 300/443] Fix stop sequence performance bug. --- CHANGELOG.md | 6 +++++- llama_cpp/llama.py | 10 ++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eb04cd..8b5fbec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,4 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added first version of the changelog \ No newline at end of file +- Added first version of the changelog + +### Fixed + +- Performance bug in stop sequence check slowing down streaming. \ No newline at end of file diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 012bb86..d7dc625 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -775,20 +775,22 @@ class Llama: break if stream: + remaining_tokens = completion_tokens[returned_tokens:] + remaining_text = self.detokenize(remaining_tokens) + remaining_length = len(remaining_text) + # We want to avoid yielding any characters from # the generated text if they are part of a stop # sequence. first_stop_position = 0 for s in stop_sequences: - for i in range(len(s), 0, -1): - if all_text.endswith(s[:i]): + for i in range(min(len(s), remaining_length), 0, -1): + if remaining_text.endswith(s[:i]): if i > first_stop_position: first_stop_position = i break token_end_position = 0 - remaining_tokens = completion_tokens[returned_tokens:] - remaining_length = len(self.detokenize(remaining_tokens)) for token in remaining_tokens: token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token From b0b154cfa6d22d317ad26f974f9916f79bbc78c2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 20:26:08 -0400 Subject: [PATCH 301/443] Add changelog message for numpy --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5fbec..ccb1c7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added first version of the changelog +- Use numpy for internal buffers to reduce memory usage and improve performance. 
### Fixed From 84e313bd6e18e341f35be6c87e7151e7ce8d926d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 22:02:16 -0400 Subject: [PATCH 302/443] Align dtype to match c structs --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3084b33..ac51ce5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -217,7 +217,7 @@ class Llama: size = llama_cpp.c_size_t(self._n_vocab) sorted = llama_cpp.c_bool(False) self._candidates_data = np.array( - [], dtype=[("id", np.intc), ("logit", np.single), ("p", np.single)] + [], dtype=np.dtype([("id", np.intc), ("logit", np.single), ("p", np.single)], align=True) ) self._candidates_data.resize(3, self._n_vocab) candidates = llama_cpp.llama_token_data_array( From 8f2b4456ad5b7a80be9264fa94927e8a79ed16a9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 22:04:31 -0400 Subject: [PATCH 303/443] Format --- llama_cpp/llama.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ac51ce5..18372c8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -217,7 +217,10 @@ class Llama: size = llama_cpp.c_size_t(self._n_vocab) sorted = llama_cpp.c_bool(False) self._candidates_data = np.array( - [], dtype=np.dtype([("id", np.intc), ("logit", np.single), ("p", np.single)], align=True) + [], + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), ) self._candidates_data.resize(3, self._n_vocab) candidates = llama_cpp.llama_token_data_array( From c2b59a5f59722d501ed21536620e24c36651f3f9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 May 2023 22:59:29 -0400 Subject: [PATCH 304/443] Import unnused import --- llama_cpp/server/app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index fea3612..882c902 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,5 +1,4 @@ import json -import logging import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict From 80066f0b802f0019395466ac090c10dcd78c97bb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 May 2023 09:12:58 -0400 Subject: [PATCH 305/443] Use async routes --- llama_cpp/server/app.py | 141 ++++++++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 55 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 882c902..ea9dec4 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,12 +1,16 @@ import json import multiprocessing from threading import Lock -from typing import List, Optional, Union, Iterator, Dict +from functools import partial +from typing import Iterator, List, Optional, Union, Dict from typing_extensions import TypedDict, Literal import llama_cpp -from fastapi import Depends, FastAPI, APIRouter +import anyio +from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool +from fastapi import Depends, FastAPI, APIRouter, Request from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict from sse_starlette.sse import EventSourceResponse @@ -241,35 +245,49 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) "/v1/completions", response_model=CreateCompletionResponse, ) -def create_completion( - request: CreateCompletionRequest, 
llama: llama_cpp.Llama = Depends(get_llama) +async def create_completion( + request: Request, + body: CreateCompletionRequest, + llama: llama_cpp.Llama = Depends(get_llama), ): - if isinstance(request.prompt, list): - assert len(request.prompt) <= 1 - request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" + if isinstance(body.prompt, list): + assert len(body.prompt) <= 1 + body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - completion_or_chunks = llama( - **request.dict( - exclude={ - "n", - "best_of", - "logit_bias", - "user", - } + exclude = { + "n", + "best_of", + "logit_bias", + "user", + } + kwargs = body.dict(exclude=exclude) + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) + + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + await inner_send_chan.send(dict(closing=True)) + raise e + + return EventSourceResponse( + recv_chan, data_sender_callable=partial(event_publisher, send_chan) ) - ) - if request.stream: - - async def server_sent_events( - chunks: Iterator[llama_cpp.CompletionChunk], - ): - for chunk in chunks: - yield dict(data=json.dumps(chunk)) - - chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(server_sent_events(chunks)) - completion: llama_cpp.Completion = completion_or_chunks # type: ignore - return completion + else: + completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore + return completion class CreateEmbeddingRequest(BaseModel): @@ -292,10 +310,12 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) "/v1/embeddings", response_model=CreateEmbeddingResponse, ) -def create_embedding( +async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"user"})) + return await run_in_threadpool( + llama.create_embedding, **request.dict(exclude={"user"}) + ) class ChatCompletionRequestMessage(BaseModel): @@ -349,36 +369,47 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet "/v1/chat/completions", response_model=CreateChatCompletionResponse, ) -def create_chat_completion( - request: CreateChatCompletionRequest, +async def create_chat_completion( + request: Request, + body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "n", - "logit_bias", - "user", - } - ), - ) + exclude = { + "n", + "logit_bias", + "user", + } + kwargs = body.dict(exclude=exclude) + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield 
dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + async for chat_chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + await inner_send_chan.send(dict(closing=True)) + raise e return EventSourceResponse( - server_sent_events(chunks), + recv_chan, + data_sender_callable=partial(event_publisher, send_chan), ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion + else: + completion: llama_cpp.ChatCompletion = await run_in_threadpool( + llama.create_chat_completion, **kwargs # type: ignore + ) + return completion class ModelData(TypedDict): @@ -397,7 +428,7 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) -def get_models( +async def get_models( settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: From 719c3eae0a5c1b3a866f4be540e0140c63dd6c4b Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Sun, 28 May 2023 15:56:38 +0200 Subject: [PATCH 306/443] Diskcache implementation for llama state. --- llama_cpp/llama.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d7dc625..447acb7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -17,20 +17,23 @@ from typing import ( ) from collections import deque, OrderedDict +import diskcache + from . 
import llama_cpp from .llama_types import * + class LlamaCache: """Cache for a llama.cpp model.""" - def __init__(self, capacity_bytes: int = (2 << 30)): - self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + def __init__(self, cache_dir="./llama_cache", capacity_bytes: int = (2 << 30)): + self.cache = diskcache.Cache(cache_dir) self.capacity_bytes = capacity_bytes @property def cache_size(self): - return sum([state.llama_state_size for state in self.cache_state.values()]) + return self.cache.volume() def _find_longest_prefix_key( self, @@ -38,10 +41,8 @@ class LlamaCache: ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None - keys = ( - (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() - ) - for k, prefix_len in keys: + for k in self.cache.iterkeys(): + prefix_len = Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k @@ -51,9 +52,9 @@ class LlamaCache: key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: - raise KeyError(f"Key not found") - value = self.cache_state[_key] - self.cache_state.move_to_end(_key) + raise KeyError("Key not found") + value = self.cache.pop(_key) + self.cache.push(_key) return value def __contains__(self, key: Sequence[int]) -> bool: @@ -61,11 +62,13 @@ class LlamaCache: def __setitem__(self, key: Sequence[int], value: "LlamaState"): key = tuple(key) - if key in self.cache_state: - del self.cache_state[key] - self.cache_state[key] = value + if key in self.cache: + del self.cache[key] + self.cache[key] = value while self.cache_size > self.capacity_bytes: - self.cache_state.popitem(last=False) + key_to_remove = next(iter(self.cache)) + del self.cache[key_to_remove] + class LlamaState: From 62ac7c3761518d718343866e87abfb4f0ae6b9bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 May 2023 21:03:33 +0000 Subject: [PATCH 307/443] Bump mkdocstrings from 0.21.2 to 0.22.0 Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.21.2 to 0.22.0. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/master/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.21.2...0.22.0) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 13 +++++++------ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 50ae0cb..733aa33 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "anyio" @@ -835,17 +835,18 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.21.2" +version = "0.22.0" description = "Automatic documentation from sources, for MkDocs." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.21.2-py3-none-any.whl", hash = "sha256:949ef8da92df9d692ca07be50616459a6b536083a25520fd54b00e8814ce019b"}, - {file = "mkdocstrings-0.21.2.tar.gz", hash = "sha256:304e56a2e90595708a38a13a278e538a67ad82052dd5c8b71f77a604a4f3d911"}, + {file = "mkdocstrings-0.22.0-py3-none-any.whl", hash = "sha256:2d4095d461554ff6a778fdabdca3c00c468c2f1459d469f7a7f622a2b23212ba"}, + {file = "mkdocstrings-0.22.0.tar.gz", hash = "sha256:82a33b94150ebb3d4b5c73bab4598c3e21468c79ec072eff6931c8f3bfc38256"}, ] [package.dependencies] +importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} Jinja2 = ">=2.11.1" Markdown = ">=3.3" MarkupSafe = ">=1.1" @@ -1653,9 +1654,9 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -server = ["fastapi", "sse-starlette", "uvicorn"] +server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "b1b158e4c9640e4dc197fe43e22c9f87e6e90945ec9b8bcba6042f81249d251e" +content-hash = "d372864238c465628bc679cbeeedd2da04ea8e33382ba5a1cc8d76b3481fcb1a" diff --git a/pyproject.toml b/pyproject.toml index aacdac0..419a971 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ sse-starlette = { version = "^1.3.3", optional = true } black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.21.2"} +mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.14" pytest = "^7.3.1" httpx = "^0.24.1" From fa79484a294c789c4fba107e6aa5210174235c9a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 May 2023 00:29:33 +0000 Subject: [PATCH 308/443] Bump scikit-build from 0.13.0 to 0.17.5 Bumps [scikit-build](https://github.com/scikit-build/scikit-build) from 0.13.0 to 0.17.5. - [Release notes](https://github.com/scikit-build/scikit-build/releases) - [Changelog](https://github.com/scikit-build/scikit-build/blob/main/CHANGES.rst) - [Commits](https://github.com/scikit-build/scikit-build/compare/0.13.0...0.17.5) --- updated-dependencies: - dependency-name: scikit-build dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 19 +++++++++++-------- pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index 733aa33..16b92f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1375,25 +1375,28 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "scikit-build" -version = "0.13.0" +version = "0.17.5" description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "scikit-build-0.13.0.tar.gz", hash = "sha256:a6ca1b7f1cc8a718564c19f535014f3a71f34508f72e750d4221f987eed0f06d"}, - {file = "scikit_build-0.13.0-py2.py3-none-any.whl", hash = "sha256:f903fef5cd76aa81dee040fa9cf3daaeff5c71fccfe5fc0bf6a62e54b166d492"}, + {file = "scikit_build-0.17.5-py3-none-any.whl", hash = "sha256:18861286b34fd2d685327d3bec6ebf4d33303adfaef28a08dd856710d16cf20f"}, + {file = "scikit_build-0.17.5.tar.gz", hash = "sha256:76856e7631d9e8887a7aa71913d5f184a6177246225391af96ce4801d89fa254"}, ] [package.dependencies] distro = "*" packaging = "*" -setuptools = {version = ">=28.0.0", markers = "python_version >= \"3\""} -wheel = ">=0.29.0" +setuptools = ">=42.0.0" +tomli = {version = "*", markers = "python_version < \"3.11\""} +wheel = ">=0.32.0" [package.extras] +cov = ["coverage[toml] (>=4.2)", "pytest-cov (>=2.7.1)"] docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] -test = ["build (>=0.5)", "codecov (>=2.0.5)", "coverage (>=4.2)", "cython (>=0.25.1)", "flake8 (>=3.0.4)", "path.py (>=11.5.0)", "pathlib2", "pytest (>=4.5.0)", "pytest-cov (>=2.7.1)", "pytest-mock (>=1.10.4)", "pytest-runner (>=5.1)", "pytest-virtualenv (>=1.2.5)", "requests", "six (>=1.10.0)", "ubelt (>=0.8.2)", "virtualenv", "xdoctest (>=0.10.0)"] +doctest = ["ubelt (>=0.8.2)", "xdoctest (>=0.10.0)"] +test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "pytest-virtualenv (>=1.2.5)", "requests", "virtualenv"] [[package]] name = "secretstorage" @@ -1659,4 +1662,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "d372864238c465628bc679cbeeedd2da04ea8e33382ba5a1cc8d76b3481fcb1a" +content-hash = "af969208807cf8dd49c51acdb309ea14019a0cd967a21c45b92e8af9f922eb3c" diff --git a/pyproject.toml b/pyproject.toml index 419a971..a8a8139 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.14" pytest = "^7.3.1" httpx = "^0.24.1" -scikit-build = "0.13" +scikit-build = "0.17.5" [tool.poetry.extras] server = ["uvicorn", "fastapi", "sse-starlette"] From 38b918503f0ab53036d518a57f232581856a2d02 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 May 2023 00:40:52 +0000 Subject: [PATCH 309/443] Bump mkdocs-material from 9.1.14 to 9.1.15 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.14 to 9.1.15. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.14...9.1.15) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 16b92f7..833935b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -800,14 +800,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.14" +version = "9.1.15" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.14-py3-none-any.whl", hash = "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"}, - {file = "mkdocs_material-9.1.14.tar.gz", hash = "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49"}, + {file = "mkdocs_material-9.1.15-py3-none-any.whl", hash = "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"}, + {file = "mkdocs_material-9.1.15.tar.gz", hash = "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a"}, ] [package.dependencies] @@ -1662,4 +1662,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "af969208807cf8dd49c51acdb309ea14019a0cd967a21c45b92e8af9f922eb3c" +content-hash = "3835d3727fcf88b9a9cbba2e376980cd32252d351f3dab279de1bf615ba28160" diff --git a/pyproject.toml b/pyproject.toml index a8a8139..52ad34b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.14" +mkdocs-material = "^9.1.15" pytest = "^7.3.1" httpx = "^0.24.1" scikit-build = "0.17.5" From 8dfb0816dfa33df71d58e5e1749beef49f27de88 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 May 2023 01:04:25 +0000 Subject: [PATCH 310/443] Bump uvicorn from 0.21.1 to 0.22.0 Bumps [uvicorn](https://github.com/encode/uvicorn) from 0.21.1 to 0.22.0. - [Release notes](https://github.com/encode/uvicorn/releases) - [Changelog](https://github.com/encode/uvicorn/blob/master/CHANGELOG.md) - [Commits](https://github.com/encode/uvicorn/compare/0.21.1...0.22.0) --- updated-dependencies: - dependency-name: uvicorn dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 833935b..945ef5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1556,14 +1556,14 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "uvicorn" -version = "0.21.1" +version = "0.22.0" description = "The lightning-fast ASGI server." 
category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "uvicorn-0.21.1-py3-none-any.whl", hash = "sha256:e47cac98a6da10cd41e6fd036d472c6f58ede6c5dbee3dbee3ef7a100ed97742"}, - {file = "uvicorn-0.21.1.tar.gz", hash = "sha256:0fac9cb342ba099e0d582966005f3fdba5b0290579fed4a6266dc702ca7bb032"}, + {file = "uvicorn-0.22.0-py3-none-any.whl", hash = "sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996"}, + {file = "uvicorn-0.22.0.tar.gz", hash = "sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8"}, ] [package.dependencies] @@ -1662,4 +1662,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "3835d3727fcf88b9a9cbba2e376980cd32252d351f3dab279de1bf615ba28160" +content-hash = "5a89d0ed28ac6e795e43b7b06f2b99d198ab56a6d0ab05d47768b84ea8a0337a" diff --git a/pyproject.toml b/pyproject.toml index 52ad34b..9ea9116 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.5.0" -uvicorn = { version = "^0.21.1", optional = true } +uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.95.0", optional = true } sse-starlette = { version = "^1.3.3", optional = true } From f4fc126a00874c756c846ce6fbf13704b60fd0b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 May 2023 01:09:10 +0000 Subject: [PATCH 311/443] Bump typing-extensions from 4.5.0 to 4.6.2 Bumps [typing-extensions](https://github.com/python/typing_extensions) from 4.5.0 to 4.6.2. - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.5.0...4.6.2) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 945ef5a..70e4272 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1526,14 +1526,14 @@ urllib3 = ">=1.26.0" [[package]] name = "typing-extensions" -version = "4.5.0" +version = "4.6.2" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, - {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, + {file = "typing_extensions-4.6.2-py3-none-any.whl", hash = "sha256:3a8b36f13dd5fdc5d1b16fe317f5668545de77fa0b8e02006381fd49d731ab98"}, + {file = "typing_extensions-4.6.2.tar.gz", hash = "sha256:06006244c70ac8ee83fa8282cb188f697b8db25bc8b4df07be1873c43897060c"}, ] [[package]] @@ -1662,4 +1662,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "5a89d0ed28ac6e795e43b7b06f2b99d198ab56a6d0ab05d47768b84ea8a0337a" +content-hash = "f5aacb68729427e49bb796a598890fedd8ba1950af3fd577fb85edde2c27338f" diff --git a/pyproject.toml b/pyproject.toml index 9ea9116..39b731e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" -typing-extensions = "^4.5.0" +typing-extensions = "^4.6.2" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.95.0", optional = true } sse-starlette = { version = "^1.3.3", optional = true } From b1daf568e390e11da6206737d4a1a8d92bb4568b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 May 2023 21:39:19 -0400 Subject: [PATCH 312/443] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5fbec..1f6dac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added first version of the changelog +- Server: Use async routes ### Fixed From f4ff8a03c4bb43ca0e0ca51e6dcdbc24f0fb13dd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 May 2023 03:06:57 -0400 Subject: [PATCH 313/443] Add numpy dependency to pyproject --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 39b731e..f75b802 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.6.2" +numpy = "^1.20.0" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.95.0", optional = true } sse-starlette = { version = "^1.3.3", optional = true } From cb0bcdbbb7bbcb4182cbb8106b8e183c9da70481 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 May 2023 03:07:36 -0400 Subject: [PATCH 314/443] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5f443..d9f52da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [v0.1.56] + ### Added - Added first version of the changelog diff --git a/pyproject.toml b/pyproject.toml index f75b802..9f83e19 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.55" +version = "0.1.56" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index a1a2c5b..39e1416 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.55", + version="0.1.56", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 9dd8cf34726b839222af45684701ee63bb6cd535 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Tue, 30 May 2023 08:20:34 +0100 Subject: [PATCH 315/443] Update bug_report.md - Added section on how to repro using llama.cpp in ./vendor/llama.cpp - Added a few more example environment commands to aid in debugging. --- .github/ISSUE_TEMPLATE/bug_report.md | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b8e33e5..5df12aa 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -57,7 +57,17 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f 3. step 3 4. etc. -**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** +**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +Try the following: + +1. `git clone https://github.com/abetlen/llama-cpp-python` +2. `cd llama-cpp-python` +3. `rm -rf _skbuild/` # delete any old builds +4. `python setup.py develop` +5. `cd ./vendor/llama.cpp` +6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp +7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. 
If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) # Failure Logs @@ -73,8 +83,14 @@ commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 llama-cpp-python$ python3 --version Python 3.10.10 -llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette" -fastapi 0.95.0 -sse-starlette 1.3.3 -uvicorn 0.21.1 +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" +fastapi 0.95.0 +numpy 1.24.3 +sse-starlette 1.3.3 +uvicorn 0.21.1 + +llama-cpp-python/vendor/llama.cpp$ git log | head -3 +commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 +Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Date: Thu May 25 20:18:01 2023 -0600 ``` From 483b6ba53af349050458d3223e41aa71829f1391 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Wed, 31 May 2023 15:16:32 +0000 Subject: [PATCH 316/443] Updated README.md instructions on how to use *_simple/Dockerfiles --- docker/README.md | 21 ++++++++++++++----- docker/{ => auto_docker}/Dockerfile | 0 docker/{ => auto_docker}/hug_model.py | 0 docker/{ => auto_docker}/start_server.sh | 0 .../Dockerfile} | 4 ++-- .../Dockerfile} | 2 +- 6 files changed, 19 insertions(+), 8 deletions(-) rename docker/{ => auto_docker}/Dockerfile (100%) rename docker/{ => auto_docker}/hug_model.py (100%) rename docker/{ => auto_docker}/start_server.sh (100%) rename docker/{Dockerfile.cuda_simple => cuda_simple/Dockerfile} (82%) rename docker/{Dockerfile.openblas_simple => openblas_simple/Dockerfile} (86%) diff --git a/docker/README.md b/docker/README.md index 100bcbd..130d180 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,10 +1,21 @@ -# Dockerfiles for building the llama-cpp-python server -- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS -- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS -- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) +# Simple Dockerfiles for building the llama-cpp-python server with external model bin files +- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image + - `cd ./openblas_simple` + - `docker build -t openblas_simple .` + - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple` + where `/` is the full path to the model file on the Docker host system. +- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image + - `cd ./cuda_simple` + - `docker build -t cuda_simple .` + - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` + where `/` is the full path to the model file on the Docker host system. + +# "Bot-in-a-box" - a method to build a Docker image by choosing a model to be downloaded and loading into a Docker image + - `cd ./auto_docker`: + - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` -# Get model from Hugging Face +## Get model from Hugging Face `python3 ./hug_model.py` You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. 
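Once one of these server images is built and running, the OpenAI-compatible endpoints it exposes can also be exercised from Python rather than curl. The following is a minimal sketch, assuming the `requests` package is installed and the container's port 8000 (the server default) has been published to the host:

```
import requests

# Ask the llama-cpp-python server (assumed reachable on localhost:8000)
# for a completion, stopping on a newline or the "###" section marker.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
        "stop": ["\n", "###"],
    },
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```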
diff --git a/docker/Dockerfile b/docker/auto_docker/Dockerfile similarity index 100% rename from docker/Dockerfile rename to docker/auto_docker/Dockerfile diff --git a/docker/hug_model.py b/docker/auto_docker/hug_model.py similarity index 100% rename from docker/hug_model.py rename to docker/auto_docker/hug_model.py diff --git a/docker/start_server.sh b/docker/auto_docker/start_server.sh similarity index 100% rename from docker/start_server.sh rename to docker/auto_docker/start_server.sh diff --git a/docker/Dockerfile.cuda_simple b/docker/cuda_simple/Dockerfile similarity index 82% rename from docker/Dockerfile.cuda_simple rename to docker/cuda_simple/Dockerfile index dda7a9f..24906d5 100644 --- a/docker/Dockerfile.cuda_simple +++ b/docker/cuda_simple/Dockerfile @@ -1,5 +1,5 @@ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM ${CUDA_IMAGE} +FROM nvidia/cuda:${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -10,7 +10,7 @@ COPY . . RUN apt update && apt install -y python3 python3-pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette -RUN LLAMA_CUBLAS=1 python3 setup.py develop +RUN LLAMA_CUBLAS=1 pip install llama-cpp-python # Run the server CMD python3 -m llama_cpp.server diff --git a/docker/Dockerfile.openblas_simple b/docker/openblas_simple/Dockerfile similarity index 86% rename from docker/Dockerfile.openblas_simple rename to docker/openblas_simple/Dockerfile index f58506f..1a95cae 100644 --- a/docker/Dockerfile.openblas_simple +++ b/docker/openblas_simple/Dockerfile @@ -9,7 +9,7 @@ COPY . . RUN apt update && apt install -y libopenblas-dev ninja-build build-essential RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette -RUN LLAMA_OPENBLAS=1 python3 setup.py develop +RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose # Run the server CMD python3 -m llama_cpp.server From 217d78320fb6096e0696182816df0bf3ae5b961a Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Wed, 31 May 2023 16:00:31 +0000 Subject: [PATCH 317/443] Added paramterised search and d/l for Hugging Face. Updated README.md --- .gitignore | 3 +++ docker/README.md | 41 +++++++++++++++++---------------- docker/auto_docker/hug_model.py | 30 ++++++++++++++++++------ 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index fd64c09..8db9bcb 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ + +# model .bin files +docker/auto_docker/*.bin diff --git a/docker/README.md b/docker/README.md index 130d180..e61095f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,3 +1,11 @@ +# Install Docker Server + +**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! 
+ +[Install Docker Engine](https://docs.docker.com/engine/install) + +**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) + # Simple Dockerfiles for building the llama-cpp-python server with external model bin files - `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image - `cd ./openblas_simple` @@ -15,14 +23,14 @@ - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` -## Get model from Hugging Face -`python3 ./hug_model.py` - -You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +## Download a Llama Model from Hugging Face +- To download a MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml` +- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` +- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 .q5_1.bin -lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin +lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin ``` **Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model: @@ -36,22 +44,15 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` -# Install Docker Server - -**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! - -[Install Docker Engine](https://docs.docker.com/engine/install) - -# Use OpenBLAS +## Use OpenBLAS Use if you don't have a NVidia GPU. 
Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -## Build: -`docker build --build-arg -t openblas .` -## Run: +### Build: +`docker build -t openblas .` +### Run: `docker run --cap-add SYS_RESOURCE -t openblas` -# Use CuBLAS -Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) -## Build: +## Use CuBLAS +### Build: `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -## Run: +### Run: `docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/auto_docker/hug_model.py b/docker/auto_docker/hug_model.py index 848a1aa..86a8214 100644 --- a/docker/auto_docker/hug_model.py +++ b/docker/auto_docker/hug_model.py @@ -2,6 +2,7 @@ import requests import json import os import struct +import argparse def make_request(url, params=None): print(f"Making request to {url}...") @@ -69,21 +70,28 @@ def get_user_choice(model_list): return None -import argparse - def main(): # Create an argument parser - parser = argparse.ArgumentParser(description='Process the model version.') + parser = argparse.ArgumentParser(description='Process some parameters.') + + # Arguments parser.add_argument('-v', '--version', type=int, default=0x0003, help='an integer for the version to be used') + parser.add_argument('-a', '--author', type=str, default='TheBloke', + help='an author to be filtered') + parser.add_argument('-t', '--tags', type=str, default='llama', + help='tags for the content') + parser.add_argument('-s', '--search', type=str, default='', + help='search term') # Parse the arguments args = parser.parse_args() # Define the parameters params = { - "author": "TheBloke", # Filter by author - "tags": "llama" + "author": args.author, + "tags": args.tags, + "search": args.search } models = make_request('https://huggingface.co/api/models', params=params) @@ -103,14 +111,22 @@ def main(): if rfilename and 'q5_1' in rfilename: model_list.append((model_id, rfilename)) - model_choice = get_user_choice(model_list) + # Choose the model + if len(model_list) == 1: + model_choice = model_list[0] + else: + model_choice = get_user_choice(model_list) + if model_choice is not None: model_id, rfilename = model_choice url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" download_file(url, rfilename) _, version = check_magic_and_version(rfilename) if version != args.version: - print(f"Warning: Expected version {args.version}, but found different version in the file.") + print(f"Warning: Expected version {args.version}, but found different version in the file.") + else: + print("Error - model choice was None") + exit(1) if __name__ == '__main__': main() From 29f9c9cca3ba56306cec0ccdef820c828e35d237 Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Wed, 31 May 2023 22:33:56 +0200 Subject: [PATCH 318/443] Added both LlamaChache classes Disk and RAM. 
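This splits the cache into two interchangeable backends: `LlamaRAMCache`, which keeps `LlamaState` objects in an in-memory `OrderedDict`, and `LlamaDiskCache`, which persists them through `diskcache` under a cache directory. A minimal usage sketch follows, assuming the classes are imported from `llama_cpp.llama` and that the model instance exposes `set_cache()` as the attachment point (the attachment point itself is not shown in this patch):

```
from llama_cpp import Llama
from llama_cpp.llama import LlamaRAMCache, LlamaDiskCache

llm = Llama(model_path="./model.bin")

# Keep reusable llama states in RAM, bounded to roughly 2 GiB...
cache = LlamaRAMCache(capacity_bytes=2 << 30)
# ...or persist them across runs instead:
# cache = LlamaDiskCache(cache_dir="./llama_cache")

llm.set_cache(cache)  # assumed attachment point; not part of this diff
print(llm("Q: What is the capital of France? A:", max_tokens=16)["choices"][0]["text"])
```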
--- llama_cpp/llama.py | 456 ++++++++++++++++++++++++++------------------- 1 file changed, 263 insertions(+), 193 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f6017a1..6ac7f1d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,6 +4,7 @@ import uuid import time import math import multiprocessing +from abc import ABC from typing import ( List, Optional, @@ -26,21 +27,94 @@ import numpy as np import numpy.typing as npt +class LlamaCache(ABC): + """Base cache class for a llama.cpp model.""" -class LlamaCache: - """Cache for a llama.cpp model.""" + def __init__(self, capacity_bytes: int = (2 << 30)): + pass + + @property + def cache_size(self): + return 0 + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + pass + + def __getitem__(self, key: Sequence[int]) -> "LlamaState": + pass + + def __contains__(self, key: Sequence[int]) -> bool: + pass + + def __setitem__(self, key: Sequence[int], value: "LlamaState"): + pass + + +class LlamaRAMCache(LlamaCache): + """Cache for a llama.cpp model using RAM.""" + + def __init__(self, capacity_bytes: int = (2 << 30)): + super().__init__(capacity_bytes) + self.capacity_bytes = capacity_bytes + self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + + @property + def cache_size(self): + return sum([state.llama_state_size for state in self.cache_state.values()]) + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key = None + keys = ( + (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key + + def __getitem__(self, key: Sequence[int]) -> "LlamaState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "LlamaState"): + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes: + self.cache_state.popitem(last=False) + + +class LlamaDiskCache(LlamaCache): + """Cache for a llama.cpp model using disk.""" def __init__(self, cache_dir="./llama_cache", capacity_bytes: int = (2 << 30)): + super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) - self.capacity_bytes = capacity_bytes @property def cache_size(self): return self.cache.volume() def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None @@ -60,9 +134,6 @@ class LlamaCache: self.cache.push(_key) return value - def __contains__(self, key: Sequence[int]) -> bool: - return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): key = tuple(key) if key in self.cache: @@ -73,16 +144,15 @@ class LlamaCache: del self.cache[key_to_remove] - class LlamaState: def __init__( - self, - eval_tokens: Deque[int], - eval_logits: Deque[List[float]], - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] - llama_state_size: int, + self, + eval_tokens: Deque[int], + 
eval_logits: Deque[List[float]], + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] + llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits @@ -114,25 +184,25 @@ class Llama: """High-level Python wrapper for a llama.cpp model.""" def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - verbose: bool = True, + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. + n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -201,12 +271,12 @@ class Llama: if self.lora_path: if llama_cpp.llama_apply_lora_from_file( - self.ctx, - llama_cpp.c_char_p(self.lora_path.encode("utf-8")), - llama_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else llama_cpp.c_char_p(0), - llama_cpp.c_int(self.n_threads), + self.ctx, + llama_cpp.c_char_p(self.lora_path.encode("utf-8")), + llama_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else llama_cpp.c_char_p(0), + llama_cpp.c_int(self.n_threads), ): raise RuntimeError( f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" @@ -317,7 +387,7 @@ class Llama: assert self.ctx is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) return_code = llama_cpp.llama_eval( @@ -339,28 +409,28 @@ class Llama: n_vocab = self._n_vocab cols = n_vocab logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] + logits = [logits_view[i * cols: (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) self._scores: npt.NDArray[np.single] = np.concatenate( (self._scores, np.array(logits, dtype=np.single)), axis=0 ) def _sample( - self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: llama_cpp.c_int, - top_k: llama_cpp.c_int, - top_p: llama_cpp.c_float, - temp: llama_cpp.c_float, - tfs_z: llama_cpp.c_float, - repeat_penalty: llama_cpp.c_float, - frequency_penalty: llama_cpp.c_float, - presence_penalty: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] + last_n_tokens_size: llama_cpp.c_int, + top_k: llama_cpp.c_int, + top_p: 
llama_cpp.c_float, + temp: llama_cpp.c_float, + tfs_z: llama_cpp.c_float, + repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, + mirostat_mode: llama_cpp.c_int, + mirostat_tau: llama_cpp.c_float, + mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -480,19 +550,19 @@ class Llama: ) def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -508,7 +578,7 @@ class Llama: assert self.ctx is not None last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() + ) + self._input_ids[-self.last_n_tokens_size:].tolist() return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data @@ -529,21 +599,21 @@ class Llama: ) def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -606,7 +676,7 @@ class Llama: logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -615,7 +685,7 @@ class Llama: tokens.extend(tokens_or_none) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None ) -> Embedding: """Embed a string. 
@@ -650,8 +720,8 @@ class Llama: n_tokens = len(tokens) total_tokens += n_tokens embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + : llama_cpp.llama_n_embd(self.ctx) + ] data.append( { @@ -685,27 +755,27 @@ class Llama: return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None @@ -757,19 +827,19 @@ class Llama: finish_reason = "length" multibyte_fix = 0 for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -821,7 +891,7 @@ class Llama: token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position >= ( - remaining_length - first_stop_position - 1 + remaining_length - first_stop_position - 1 ): break logprobs_or_none: Optional[CompletionLogprobs] = None @@ -882,7 +952,7 @@ class Llama: break if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -947,8 +1017,8 @@ class Llama: "choices": [ { "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, @@ -1014,10 +1084,10 @@ class Llama: for token in 
all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] + Llama.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs + all_tokens, all_token_strs, all_logprobs ): text_offsets.append(text_offset) text_offset += len(token_str) @@ -1068,27 +1138,27 @@ class Llama: } def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1141,27 +1211,27 @@ class Llama: return completion def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -1209,7 +1279,7 @@ class Llama: ) def _convert_text_completion_to_chat( - self, completion: Completion + self, completion: Completion ) -> ChatCompletion: return { "id": "chat" + completion["id"], @@ -1230,8 +1300,8 @@ class Llama: } def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], + self, + chunks: Iterator[CompletionChunk], ) -> Iterator[ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -1267,22 +1337,22 @@ class Llama: } def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. From 1848afebe0bb6ef735d29743a53caabba0cf1caf Mon Sep 17 00:00:00 2001 From: Maximilian-Winter Date: Wed, 31 May 2023 22:41:35 +0200 Subject: [PATCH 319/443] Added dependencies. --- pyproject.toml | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9f83e19..45e1b8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ include = [ python = "^3.8.1" typing-extensions = "^4.6.2" numpy = "^1.20.0" +diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.95.0", optional = true } sse-starlette = { version = "^1.3.3", optional = true } diff --git a/setup.py b/setup.py index 39e1416..ac52c78 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, packages=["llama_cpp", "llama_cpp.server"], - install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0"], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], }, From 5377f9784aec86970cacaf0f61689a4f41badde9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 May 2023 23:24:52 -0400 Subject: [PATCH 320/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 66874d4..ffb06a3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 +Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de From 71f4582d4469ba74529386abb66a835e3ad1c374 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 May 2023 23:25:39 -0400 Subject: [PATCH 321/443] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9f83e19..0a0e569 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.56" +version = "0.1.57" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 39e1416..04d0554 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.56", + version="0.1.57", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From cf4931a4006a1d701f2c4ea5b2ce3cb02350d57d Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 08:48:54 +0000 Subject: [PATCH 322/443] Working Open Llama 3B in a box --- docker/README.md | 5 ++-- docker/{auto_docker => open_llama}/Dockerfile | 0 docker/open_llama/build.sh | 14 +++++++++ .../{auto_docker => open_llama}/hug_model.py | 29 ++++++++++++------- docker/open_llama/start.sh | 28 ++++++++++++++++++ .../start_server.sh | 2 +- 6 files changed, 64 insertions(+), 14 deletions(-) rename docker/{auto_docker => open_llama}/Dockerfile (100%) create mode 100755 docker/open_llama/build.sh rename docker/{auto_docker => open_llama}/hug_model.py (83%) create mode 100755 docker/open_llama/start.sh rename docker/{auto_docker => open_llama}/start_server.sh (94%) diff --git a/docker/README.md b/docker/README.md index e61095f..2fb7ef8 100644 --- a/docker/README.md +++ b/docker/README.md @@ -24,7 +24,7 @@ - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` ## Download a Llama Model from Hugging Face -- To download a MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml` +- To download a MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin` - To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` @@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_ | Model | Quantized size | |------:|----------------:| +| 3B | 3 GB | | 7B | 5 GB | | 13B | 10 GB | -| 30B | 25 GB | +| 33B | 25 GB | | 65B | 50 GB | **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` diff --git a/docker/auto_docker/Dockerfile b/docker/open_llama/Dockerfile similarity index 100% rename from docker/auto_docker/Dockerfile rename to docker/open_llama/Dockerfile diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh new file mode 100755 index 0000000..3a6457d --- /dev/null +++ b/docker/open_llama/build.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +MODEL="open_llama_3b" +# Get open_llama_3b_ggml q5_1 quantization +python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" +ls -lh *.bin + +# Build the default OpenBLAS image +docker build -t $MODEL . 
+docker images | egrep "^(REPOSITORY|$MODEL)" + +echo +echo "To start the docker container run:" +echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/auto_docker/hug_model.py b/docker/open_llama/hug_model.py similarity index 83% rename from docker/auto_docker/hug_model.py rename to docker/open_llama/hug_model.py index 86a8214..13c5b6b 100644 --- a/docker/auto_docker/hug_model.py +++ b/docker/open_llama/hug_model.py @@ -76,13 +76,15 @@ def main(): # Arguments parser.add_argument('-v', '--version', type=int, default=0x0003, - help='an integer for the version to be used') + help='hexadecimal version number of ggml file') parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='an author to be filtered') - parser.add_argument('-t', '--tags', type=str, default='llama', - help='tags for the content') + help='HuggingFace author filter') + parser.add_argument('-t', '--tag', type=str, default='llama', + help='HuggingFace tag filter') parser.add_argument('-s', '--search', type=str, default='', - help='search term') + help='HuggingFace search filter') + parser.add_argument('-f', '--filename', type=str, default='q5_1', + help='HuggingFace model repository filename substring match') # Parse the arguments args = parser.parse_args() @@ -90,7 +92,7 @@ def main(): # Define the parameters params = { "author": args.author, - "tags": args.tags, + "tags": args.tag, "search": args.search } @@ -108,11 +110,15 @@ def main(): for sibling in model_info.get('siblings', []): rfilename = sibling.get('rfilename') - if rfilename and 'q5_1' in rfilename: + if rfilename and args.filename in rfilename: model_list.append((model_id, rfilename)) # Choose the model - if len(model_list) == 1: + model_list.sort(key=lambda x: x[0]) + if len(model_list) == 0: + print("No models found") + exit(1) + elif len(model_list) == 1: model_choice = model_list[0] else: model_choice = get_user_choice(model_list) @@ -120,13 +126,14 @@ def main(): if model_choice is not None: model_id, rfilename = model_choice url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" - download_file(url, rfilename) - _, version = check_magic_and_version(rfilename) + dest = f"{model_id.replace('/', '_')}_{rfilename}" + download_file(url, dest) + _, version = check_magic_and_version(dest) if version != args.version: print(f"Warning: Expected version {args.version}, but found different version in the file.") else: print("Error - model choice was None") - exit(1) + exit(2) if __name__ == '__main__': main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh new file mode 100755 index 0000000..7ee8f74 --- /dev/null +++ b/docker/open_llama/start.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +MODEL="open_llama_3b" + +# Start Docker container +docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & +sleep 10 +echo +docker ps | egrep "(^CONTAINER|$MODEL)" + +# Test the model works +echo +curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": [ + "\n", + "###" + ] +}' | grep Paris +if [ $? -eq 0 ] +then + echo + echo "$MODEL is working!!" +else + echo + echo "ERROR: $MODEL not replying." 
+ exit 1 +fi diff --git a/docker/auto_docker/start_server.sh b/docker/open_llama/start_server.sh similarity index 94% rename from docker/auto_docker/start_server.sh rename to docker/open_llama/start_server.sh index 176bd87..d3329ee 100755 --- a/docker/auto_docker/start_server.sh +++ b/docker/open_llama/start_server.sh @@ -1,6 +1,6 @@ #!/bin/sh -# For mmap support +# For mlock support ulimit -l unlimited if [ "$IMAGE" = "python:3-slim-bullseye" ]; then From f24e7a7e5229448ba64ab819287d07887567840d Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 10:44:52 +0000 Subject: [PATCH 323/443] Updated instructions --- docker/README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docker/README.md b/docker/README.md index 2fb7ef8..f4954d1 100644 --- a/docker/README.md +++ b/docker/README.md @@ -18,14 +18,15 @@ - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` where `/` is the full path to the model file on the Docker host system. -# "Bot-in-a-box" - a method to build a Docker image by choosing a model to be downloaded and loading into a Docker image - - `cd ./auto_docker`: - - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) -- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` - -## Download a Llama Model from Hugging Face -- To download a MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin` -- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` +# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +``` +$ cd ./open_llama +./build.sh +./start.sh +``` + +# Manually choose your own Llama model from Hugging Face +- `python3 ./hug_model.py -a TheBloke -t llama` - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin From d4eef735d9d70cf1d8a9e098914b16ccf70f06fe Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 11:03:19 +0000 Subject: [PATCH 324/443] Fixed .gitignore to ignore any downloaded model .bin files. Cleaned up README.md again --- .gitignore | 4 ++-- docker/README.md | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 8db9bcb..79093b4 100644 --- a/.gitignore +++ b/.gitignore @@ -165,5 +165,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
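The downloaded GGML model files that this .gitignore change covers can also be loaded directly from Python, without building a Docker image. A minimal sketch, assuming `llama-cpp-python` is installed and that `./model.bin` points at the file fetched by `hug_model.py` (both the path and the prompt are placeholders):

```python
from llama_cpp import Llama

# "./model.bin" is an assumed path: point it at the q5_1 file downloaded by
# hug_model.py, or at the model.bin symlink described in docker/README.md.
llm = Llama(model_path="./model.bin", n_ctx=512)

result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=32,
    stop=["\n", "###"],
)
print(result["choices"][0]["message"]["content"])
```

The other keyword arguments shown in the `create_chat_completion` signature above (`temperature`, `top_p`, `mirostat_mode`, and so on) can be passed here in the same way.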
.idea/ -# model .bin files -docker/auto_docker/*.bin +# downloaded model .bin files +docker/open_llama/*.bin diff --git a/docker/README.md b/docker/README.md index f4954d1..c7e92d0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,16 +7,21 @@ **Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) # Simple Dockerfiles for building the llama-cpp-python server with external model bin files -- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image - - `cd ./openblas_simple` - - `docker build -t openblas_simple .` - - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple` - where `/` is the full path to the model file on the Docker host system. -- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image - - `cd ./cuda_simple` - - `docker build -t cuda_simple .` - - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` - where `/` is the full path to the model file on the Docker host system. +## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image +``` +cd ./openblas_simple +docker build -t openblas_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple +``` +where `/` is the full path to the model file on the Docker host system. + +## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image +``` +cd ./cuda_simple +docker build -t cuda_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple +``` +where `/` is the full path to the model file on the Docker host system. # "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server ``` From 30d32e996b3bbb4ad641ab275cf1d985f950d1cd Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 11:08:59 +0000 Subject: [PATCH 325/443] More README.md corrections and cleanup --- docker/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/README.md b/docker/README.md index c7e92d0..053d311 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,7 +4,7 @@ [Install Docker Engine](https://docs.docker.com/engine/install) -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) +**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) # Simple Dockerfiles for building the llama-cpp-python server with external model bin files ## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image @@ -23,7 +23,8 @@ docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v : ``` where `/` is the full path to the model file on the Docker host system. 
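Once either of these containers is running, the server can be exercised from Python as well as with the `curl` test in `start.sh`. A minimal sketch, assuming the container publishes port 8000 on localhost and that the `requests` package is available:

```python
import requests

# Mirrors the curl check in docker/open_llama/start.sh; localhost:8000 assumes
# the container was started with "-p 8000:8000" as in that script.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
        "stop": ["\n", "###"],
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```

Any of the other completion parameters accepted by the server can be added to the same JSON body.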
-# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +# "Open-Llama-in-a-box" +## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server ``` $ cd ./open_llama ./build.sh @@ -31,8 +32,8 @@ $ cd ./open_llama ``` # Manually choose your own Llama model from Hugging Face -- `python3 ./hug_model.py -a TheBloke -t llama` -- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +`python3 ./hug_model.py -a TheBloke -t llama` +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin From 76e364cdf2f580778a893cc7a3c456235c1b894a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 4 Jun 2023 23:30:10 -0400 Subject: [PATCH 326/443] Added 0.1.57 notes --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9f52da..56e38c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [v0.1.57] + +- Added: OpenLlama 3B support + ## [v0.1.56] ### Added From 6d5b049801568b8f25f6cc0ba433c43e471750bc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 4 Jun 2023 23:30:42 -0400 Subject: [PATCH 327/443] Update llama.cpp --- CHANGELOG.md | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56e38c4..8410ed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added: Metal Silicon support + ## [v0.1.57] - Added: OpenLlama 3B support diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ffb06a3..827f5ed 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de +Subproject commit 827f5eda91e5b7299848ee2c7179d873bdee0f7b From 18c7b8520e849bbd6a0260778a0a7243a3fff545 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 4 Jun 2023 23:31:51 -0400 Subject: [PATCH 328/443] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8410ed3..a5006ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [v0.1.58] + - Added: Metal Silicon support ## [v0.1.57] diff --git a/pyproject.toml b/pyproject.toml index 0a0e569..f2d610a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.57" +version = "0.1.58" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 04d0554..7a0cdc3 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.57", + version="0.1.58", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 4bcaa5293c8a7e4f00981516658fa3824c2f1633 Mon Sep 17 
00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 21:04:04 +0000 Subject: [PATCH 329/443] Bump sse-starlette from 1.5.0 to 1.6.1 Bumps [sse-starlette](https://github.com/sysid/sse-starlette) from 1.5.0 to 1.6.1. - [Release notes](https://github.com/sysid/sse-starlette/releases) - [Changelog](https://github.com/sysid/sse-starlette/blob/master/CHANGELOG.md) - [Commits](https://github.com/sysid/sse-starlette/compare/v1.5.0...v1.6.1) --- updated-dependencies: - dependency-name: sse-starlette dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 46 ++++++++++++++++++++++++++++++++++++++++++---- pyproject.toml | 2 +- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 70e4272..8c35153 100644 --- a/poetry.lock +++ b/poetry.lock @@ -901,6 +901,44 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.3" +description = "Fundamental package for array computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3c1104d3c036fb81ab923f507536daedc718d0ad5a8707c6061cdfd6d184e570"}, + {file = "numpy-1.24.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:202de8f38fc4a45a3eea4b63e2f376e5f2dc64ef0fa692838e31a808520efaf7"}, + {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8535303847b89aa6b0f00aa1dc62867b5a32923e4d1681a35b5eef2d9591a463"}, + {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d926b52ba1367f9acb76b0df6ed21f0b16a1ad87c6720a1121674e5cf63e2b6"}, + {file = "numpy-1.24.3-cp310-cp310-win32.whl", hash = "sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"}, + {file = "numpy-1.24.3-cp310-cp310-win_amd64.whl", hash = "sha256:ab5f23af8c16022663a652d3b25dcdc272ac3f83c3af4c02eb8b824e6b3ab9d7"}, + {file = "numpy-1.24.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9a7721ec204d3a237225db3e194c25268faf92e19338a35f3a224469cb6039a3"}, + {file = "numpy-1.24.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d6cc757de514c00b24ae8cf5c876af2a7c3df189028d68c0cb4eaa9cd5afc2bf"}, + {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e3f4e85fc5d4fd311f6e9b794d0c00e7002ec122be271f2019d63376f1d385"}, + {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d3c026f57ceaad42f8231305d4653d5f05dc6332a730ae5c0bea3513de0950"}, + {file = "numpy-1.24.3-cp311-cp311-win32.whl", hash = "sha256:c91c4afd8abc3908e00a44b2672718905b8611503f7ff87390cc0ac3423fb096"}, + {file = "numpy-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:5342cf6aad47943286afa6f1609cad9b4266a05e7f2ec408e2cf7aea7ff69d80"}, + {file = "numpy-1.24.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7776ea65423ca6a15255ba1872d82d207bd1e09f6d0894ee4a64678dd2204078"}, + {file = "numpy-1.24.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ae8d0be48d1b6ed82588934aaaa179875e7dc4f3d84da18d7eae6eb3f06c242c"}, + {file = "numpy-1.24.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c"}, + {file = 
"numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4749e053a29364d3452c034827102ee100986903263e89884922ef01a0a6fd2f"}, + {file = "numpy-1.24.3-cp38-cp38-win32.whl", hash = "sha256:d933fabd8f6a319e8530d0de4fcc2e6a61917e0b0c271fded460032db42a0fe4"}, + {file = "numpy-1.24.3-cp38-cp38-win_amd64.whl", hash = "sha256:56e48aec79ae238f6e4395886b5eaed058abb7231fb3361ddd7bfdf4eed54289"}, + {file = "numpy-1.24.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4719d5aefb5189f50887773699eaf94e7d1e02bf36c1a9d353d9f46703758ca4"}, + {file = "numpy-1.24.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ec87a7084caa559c36e0a2309e4ecb1baa03b687201d0a847c8b0ed476a7187"}, + {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea8282b9bcfe2b5e7d491d0bf7f3e2da29700cec05b49e64d6246923329f2b02"}, + {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:210461d87fb02a84ef243cac5e814aad2b7f4be953b32cb53327bb49fd77fbb4"}, + {file = "numpy-1.24.3-cp39-cp39-win32.whl", hash = "sha256:784c6da1a07818491b0ffd63c6bbe5a33deaa0e25a20e1b3ea20cf0e43f8046c"}, + {file = "numpy-1.24.3-cp39-cp39-win_amd64.whl", hash = "sha256:d5036197ecae68d7f491fcdb4df90082b0d4960ca6599ba2659957aafced7c17"}, + {file = "numpy-1.24.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:352ee00c7f8387b44d19f4cada524586f07379c0d49270f87233983bc5087ca0"}, + {file = "numpy-1.24.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7d6acc2e7524c9955e5c903160aa4ea083736fde7e91276b0e5d98e6332812"}, + {file = "numpy-1.24.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:35400e6a8d102fd07c71ed7dcadd9eb62ee9a6e84ec159bd48c28235bbb0f8e4"}, + {file = "numpy-1.24.3.tar.gz", hash = "sha256:ab344f1bf21f140adab8e47fdbc7c35a477dc01408791f8ba00d018dd0bc5155"}, +] + [[package]] name = "packaging" version = "23.1" @@ -1457,14 +1495,14 @@ files = [ [[package]] name = "sse-starlette" -version = "1.5.0" +version = "1.6.1" description = "\"SSE plugin for Starlette\"" category = "main" optional = true python-versions = ">=3.8" files = [ - {file = "sse-starlette-1.5.0.tar.gz", hash = "sha256:4fa989d906f29ba456a047071cbd9eab8c934042d5da4660543ad4b61c59c092"}, - {file = "sse_starlette-1.5.0-py3-none-any.whl", hash = "sha256:b41aac15f83191a4fc381e8cd152285cd44e328f409dc2bdfd4b7d7f33ea3865"}, + {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, + {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, ] [package.dependencies] @@ -1662,4 +1700,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f5aacb68729427e49bb796a598890fedd8ba1950af3fd577fb85edde2c27338f" +content-hash = "0a84fbb944af1a0fe38038389c6baf1e987d98e7cbfd35cbf5642f7e123aa16f" diff --git a/pyproject.toml b/pyproject.toml index f2d610a..b2a0f53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.2" numpy = "^1.20.0" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.95.0", optional = true } -sse-starlette = { version = "^1.3.3", optional = true } +sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] black = "^23.3.0" From 7b57420ea98e0ab957955ff05507939db1a23986 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 5 Jun 2023 
18:17:29 -0400 Subject: [PATCH 330/443] Update llama.cpp --- CHANGELOG.md | 2 ++ llama_cpp/llama_cpp.py | 18 ++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5006ed..ce839f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added: k-quants support + ## [v0.1.58] - Added: Metal Silicon support diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 541ee00..11d0ad4 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -191,6 +191,15 @@ llama_context_params_p = POINTER(llama_context_params) # LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors # }; LLAMA_FTYPE_ALL_F32 = c_int(0) LLAMA_FTYPE_MOSTLY_F16 = c_int(1) @@ -200,6 +209,15 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10) +LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11) +LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12) +LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13) +LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14) +LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15) +LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16) +LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) +LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) # LLAMA_API struct llama_context_params llama_context_default_params(); diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 827f5ed..f4c55d3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 827f5eda91e5b7299848ee2c7179d873bdee0f7b +Subproject commit f4c55d3bd7e124b101bc974cbbf0e0dbbc32d5a3 From 9e400616648fb98c3408b745264dd840f51898f8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 23:04:20 +0000 Subject: [PATCH 331/443] Bump fastapi from 0.95.1 to 0.96.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.95.1 to 0.96.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.95.1...0.96.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 16 ++++++++-------- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8c35153..f1ecf8e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -374,19 +374,19 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.95.1" +version = "0.96.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.95.1-py3-none-any.whl", hash = "sha256:a870d443e5405982e1667dfe372663abf10754f246866056336d7f01c21dab07"}, - {file = "fastapi-0.95.1.tar.gz", hash = "sha256:9569f0a381f8a457ec479d90fa01005cfddaae07546eb1f3fa035bc4797ae7d5"}, + {file = "fastapi-0.96.0-py3-none-any.whl", hash = "sha256:b8e11fe81e81eab4e1504209917338e0b80f783878a42c2b99467e5e1019a1e9"}, + {file = "fastapi-0.96.0.tar.gz", hash = "sha256:71232d47c2787446991c81c41c249f8a16238d52d779c0e6b43927d3773dbe3c"}, ] [package.dependencies] pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" -starlette = ">=0.26.1,<0.27.0" +starlette = ">=0.27.0,<0.28.0" [package.extras] all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] @@ -1510,14 +1510,14 @@ starlette = "*" [[package]] name = "starlette" -version = "0.26.1" +version = "0.27.0" description = "The little ASGI library that shines." category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "starlette-0.26.1-py3-none-any.whl", hash = "sha256:e87fce5d7cbdde34b76f0ac69013fd9d190d581d80681493016666e6f96c6d5e"}, - {file = "starlette-0.26.1.tar.gz", hash = "sha256:41da799057ea8620e4667a3e69a5b1923ebd32b1819c8fa75634bbe8d8bea9bd"}, + {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, + {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, ] [package.dependencies] @@ -1700,4 +1700,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "0a84fbb944af1a0fe38038389c6baf1e987d98e7cbfd35cbf5642f7e123aa16f" +content-hash = "4eefe8dc7ed2ae6262828e6f4329d0539b667ac113221dc3e927b166b8bb8619" diff --git a/pyproject.toml b/pyproject.toml index b2a0f53..a7a5ceb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ python = "^3.8.1" typing-extensions = "^4.6.2" numpy = "^1.20.0" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.95.0", optional = true } +fastapi = { version = "^0.96.0", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From fa7285c51f0e4efcbbe5d9ab75d183e485f80ff8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 23:30:17 +0000 Subject: [PATCH 332/443] Bump scikit-build from 0.17.5 to 0.17.6 Bumps [scikit-build](https://github.com/scikit-build/scikit-build) from 0.17.5 to 0.17.6. 
- [Release notes](https://github.com/scikit-build/scikit-build/releases) - [Changelog](https://github.com/scikit-build/scikit-build/blob/main/CHANGES.rst) - [Commits](https://github.com/scikit-build/scikit-build/compare/0.17.5...0.17.6) --- updated-dependencies: - dependency-name: scikit-build dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index f1ecf8e..5d3c4e1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1413,14 +1413,14 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "scikit-build" -version = "0.17.5" +version = "0.17.6" description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "scikit_build-0.17.5-py3-none-any.whl", hash = "sha256:18861286b34fd2d685327d3bec6ebf4d33303adfaef28a08dd856710d16cf20f"}, - {file = "scikit_build-0.17.5.tar.gz", hash = "sha256:76856e7631d9e8887a7aa71913d5f184a6177246225391af96ce4801d89fa254"}, + {file = "scikit_build-0.17.6-py3-none-any.whl", hash = "sha256:18bd55e81841106eec93f30a297df4f301003791c41be46ef6428d58bd42d6b3"}, + {file = "scikit_build-0.17.6.tar.gz", hash = "sha256:b51a51a36b37c42650994b5047912f59b22e3210b23e321f287611f9ef6e5c9d"}, ] [package.dependencies] @@ -1434,7 +1434,7 @@ wheel = ">=0.32.0" cov = ["coverage[toml] (>=4.2)", "pytest-cov (>=2.7.1)"] docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] doctest = ["ubelt (>=0.8.2)", "xdoctest (>=0.10.0)"] -test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "pytest-virtualenv (>=1.2.5)", "requests", "virtualenv"] +test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "requests", "virtualenv"] [[package]] name = "secretstorage" @@ -1700,4 +1700,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "4eefe8dc7ed2ae6262828e6f4329d0539b667ac113221dc3e927b166b8bb8619" +content-hash = "64bfca53778d0ca567082252ee03d951ed9e3f14dde336db0fbd3f2f384e8184" diff --git a/pyproject.toml b/pyproject.toml index a7a5ceb..ad8df43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.15" pytest = "^7.3.1" httpx = "^0.24.1" -scikit-build = "0.17.5" +scikit-build = "0.17.6" [tool.poetry.extras] server = ["uvicorn", "fastapi", "sse-starlette"] From 5702d30a83a279075a7bcd5f66f451efcbe90980 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Jun 2023 23:46:45 +0000 Subject: [PATCH 333/443] Bump typing-extensions from 4.6.2 to 4.6.3 Bumps [typing-extensions](https://github.com/python/typing_extensions) from 4.6.2 to 4.6.3. - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.6.2...4.6.3) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5d3c4e1..4a9c572 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1564,14 +1564,14 @@ urllib3 = ">=1.26.0" [[package]] name = "typing-extensions" -version = "4.6.2" +version = "4.6.3" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.2-py3-none-any.whl", hash = "sha256:3a8b36f13dd5fdc5d1b16fe317f5668545de77fa0b8e02006381fd49d731ab98"}, - {file = "typing_extensions-4.6.2.tar.gz", hash = "sha256:06006244c70ac8ee83fa8282cb188f697b8db25bc8b4df07be1873c43897060c"}, + {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, + {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, ] [[package]] @@ -1700,4 +1700,4 @@ server = ["uvicorn", "fastapi", "sse-starlette"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "64bfca53778d0ca567082252ee03d951ed9e3f14dde336db0fbd3f2f384e8184" +content-hash = "5c3354c253bc7ab7c7577a9a3733c7a341e91176e1d0c13dc2e3f3dcc0971bbe" diff --git a/pyproject.toml b/pyproject.toml index ad8df43..09991e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" -typing-extensions = "^4.6.2" +typing-extensions = "^4.6.3" numpy = "^1.20.0" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.96.0", optional = true } From 9b1c9e902c7846a2cf5d88ce65d35a5f9d9c5f3a Mon Sep 17 00:00:00 2001 From: Eric B Date: Mon, 5 Jun 2023 22:37:11 -0400 Subject: [PATCH 334/443] Added mirostat support for completions, chat completions API --- llama_cpp/server/app.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ea9dec4..23382e1 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -191,6 +191,27 @@ frequency_penalty_field = Field( description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", ) +mirostat_mode_field = Field( + default=0, + ge=0, + le=2, + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" +) + +mirostat_tau_field = Field( + default=5.0, + ge=0.0, + le=10.0, + description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" +) + +mirostat_eta_field = Field( + default=0.1, + ge=0.001, + le=1.0, + description="Mirostat learning rate" +) + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( @@ -203,6 +224,9 @@ class CreateCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. 
Useful for chatbots.", @@ -332,6 +356,9 @@ class CreateChatCompletionRequest(BaseModel): max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field stop: Optional[List[str]] = stop_field stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field From 8b4968ea673b9cb7a218712d4bf2c74703ac95b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Jun 2023 11:37:57 -0400 Subject: [PATCH 335/443] Fix resize issue. Closes #330 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 18372c8..2a96ff8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -222,7 +222,7 @@ class Llama: [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True ), ) - self._candidates_data.resize(3, self._n_vocab) + self._candidates_data.resize(3, self._n_vocab, refcheck=False) candidates = llama_cpp.llama_token_data_array( data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), size=size, From aad4b17f5259829b88584845c20ab27bf42bcb00 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Jun 2023 16:23:55 -0400 Subject: [PATCH 336/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 17 +++++++++++++---- vendor/llama.cpp | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 11d0ad4..a1634fa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -79,6 +79,10 @@ c_size_t_p = POINTER(c_size_t) # llama.h bindings +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + # #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) # #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' @@ -142,9 +146,12 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # struct llama_context_params { -# int n_ctx; // text context -# int n_gpu_layers; // number of layers to store in VRAM -# int seed; // RNG seed, -1 for random +# int n_ctx; // text context +# int n_batch; // prompt processing batch size +# int n_gpu_layers; // number of layers to store in VRAM +# int main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs +# int seed; // RNG seed, -1 for random # bool f16_kv; // use fp16 for KV cache # bool logits_all; // the llama_eval() call computes all logits, not just the last one @@ -153,7 +160,6 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # bool use_mlock; // force system to keep model in RAM # bool embedding; // embedding mode only - # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback @@ -162,7 +168,10 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), + ("n_batch", c_int), ("n_gpu_layers", c_int), + ("main_gpu", c_int), + ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), ("seed", c_int), ("f16_kv", c_bool), ( diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f4c55d3..2d7bf11 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp 
@@ -1 +1 @@ -Subproject commit f4c55d3bd7e124b101bc974cbbf0e0dbbc32d5a3 +Subproject commit 2d7bf110edd8c49209401a16132052cba706ffd0 From 0e156ffd6687251973b892b4f470eeee05bcc6b7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Jun 2023 17:01:10 -0400 Subject: [PATCH 337/443] Fix changelog format --- CHANGELOG.md | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce839f1..812d97f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,24 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- Added: k-quants support +### Added + +- (llama.cpp) k-quants support +- (server) mirostat sampling parameters to server ## [v0.1.58] -- Added: Metal Silicon support +### Added + +- (llama.cpp) Metal Silicon support ## [v0.1.57] -- Added: OpenLlama 3B support +### Added + +- (llama.cpp) OpenLlama 3B support ## [v0.1.56] ### Added -- Added first version of the changelog -- Server: Use async routes -- Use numpy for internal buffers to reduce memory usage and improve performance. +- (misc) Added first version of the changelog +- (server) Use async routes +- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance. ### Fixed -- Performance bug in stop sequence check slowing down streaming. \ No newline at end of file +- (python-api) Performance bug in stop sequence check slowing down streaming. \ No newline at end of file From cf6a9d6d8eace1feca1aec8554e26c316c140ef4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 7 Jun 2023 03:42:55 -0400 Subject: [PATCH 338/443] Add framework and archive destinations to cmake for macos and windows. --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 16932b1..e5fac6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,5 +27,7 @@ else() TARGETS llama LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp + ARCHIVE DESTINATION llama_cpp + FRAMEWORK DESTINATION llama_cpp ) endif() From bf322861e8120da88c3d4d87dd7d0392b1747a7b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 7 Jun 2023 03:43:33 -0400 Subject: [PATCH 339/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2d7bf11..5b57a5b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2d7bf110edd8c49209401a16132052cba706ffd0 +Subproject commit 5b57a5b72676540b6a45a3f527126299969ad241 From 69355403c6feb68a5578df74db86c7d3ab20eebc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 00:15:17 -0400 Subject: [PATCH 340/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5b57a5b..5c64a09 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5b57a5b72676540b6a45a3f527126299969ad241 +Subproject commit 5c64a0952ee58b2d742ee84e8e3d43cce5d366db From c9e79c66817551e3f6e5a740a314f4d017e4219f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 00:22:39 -0400 Subject: [PATCH 341/443] Add metal build flags --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index c1fe552..ddae25f 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,9 @@ build.openblas: build.blis: CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop +build.metal: + 
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop + build.sdist: python3 setup.py sdist From 607d217caaa2dbe51612226b972024fb974bf876 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 00:27:19 -0400 Subject: [PATCH 342/443] Allow both .so and .dylib extensions for macos --- llama_cpp/llama_cpp.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a1634fa..bb9b0e5 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -15,28 +15,32 @@ from ctypes import ( c_size_t, ) import pathlib +from typing import List # Load the library def _load_shared_library(lib_base_name: str): - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - lib_ext = ".so" - elif sys.platform == "darwin": - lib_ext = ".so" - elif sys.platform == "win32": - lib_ext = ".dll" - else: - raise RuntimeError("Unsupported platform") - # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) - _lib_paths = [ - _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}", - ] + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] @@ -160,6 +164,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # bool use_mlock; // force system to keep model in RAM # bool embedding; // embedding mode only + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback From e169a960f6b762cbf9a4329bf3ee8a1ef988b453 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 00:40:04 -0400 Subject: [PATCH 343/443] Ensure make clean removes all shared library files --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ddae25f..d2f38da 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,9 @@ clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so - rm -rf _skbuild - - rm llama_cpp/libllama.so + - rm llama_cpp/*.so + - rm llama_cpp/*.dylib + - rm llama_cpp/*.dll .PHONY: \ update \ From 90874c01cde6df7031ff382646c9ebabfabe2afc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 03:26:49 -0400 Subject: [PATCH 344/443] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 812d97f..e9546b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [v0.1.59] + ### Added - (llama.cpp) k-quants support - (server) mirostat sampling parameters to server +### Fixed + +- Support both `.so` and `.dylib` for 
`libllama` on MacOS + ## [v0.1.58] ### Added diff --git a/pyproject.toml b/pyproject.toml index ccbefe2..05c9271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.58" +version = "0.1.59" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 4b0d7cb..20ebc95 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.58", + version="0.1.59", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 0c421685084415229ae2689fb399ce55e73d9daf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 13:19:23 -0400 Subject: [PATCH 345/443] Fix cache implementation breaking changes --- llama_cpp/llama.py | 450 +++++++++++++++++++++------------------- llama_cpp/server/app.py | 9 + 2 files changed, 247 insertions(+), 212 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b88fd65..05994b6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,7 @@ import uuid import time import math import multiprocessing -from abc import ABC +from abc import ABC, abstractmethod from typing import ( List, Optional, @@ -27,33 +27,37 @@ import numpy as np import numpy.typing as npt -class LlamaCache(ABC): +class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): - pass + self.capacity_bytes = capacity_bytes @property - def cache_size(self): - return 0 + @abstractmethod + def cache_size(self) -> int: + raise NotImplementedError def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: pass + @abstractmethod def __getitem__(self, key: Sequence[int]) -> "LlamaState": - pass + raise NotImplementedError + @abstractmethod def __contains__(self, key: Sequence[int]) -> bool: - pass + raise NotImplementedError - def __setitem__(self, key: Sequence[int], value: "LlamaState"): - pass + @abstractmethod + def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None: + raise NotImplementedError -class LlamaRAMCache(LlamaCache): +class LlamaRAMCache(BaseLlamaCache): """Cache for a llama.cpp model using RAM.""" def __init__(self, capacity_bytes: int = (2 << 30)): @@ -66,8 +70,8 @@ class LlamaRAMCache(LlamaCache): return sum([state.llama_state_size for state in self.cache_state.values()]) def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None @@ -97,32 +101,38 @@ class LlamaRAMCache(LlamaCache): if key in self.cache_state: del self.cache_state[key] self.cache_state[key] = value - while self.cache_size > self.capacity_bytes: + while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: self.cache_state.popitem(last=False) -class LlamaDiskCache(LlamaCache): +# Alias for backwards compatibility +LlamaCache = LlamaRAMCache + + +class LlamaDiskCache(BaseLlamaCache): """Cache for a llama.cpp model using disk.""" - def __init__(self, cache_dir="./llama_cache", capacity_bytes: int = (2 << 30)): + def __init__( + self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) @property def cache_size(self): - return self.cache.volume() + 
return int(self.cache.volume()) # type: ignore def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 - min_key = None - for k in self.cache.iterkeys(): + min_key: Optional[Tuple[int, ...]] = None + for k in self.cache.iterkeys(): # type: ignore prefix_len = Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len - min_key = k + min_key = k # type: ignore return min_key def __getitem__(self, key: Sequence[int]) -> "LlamaState": @@ -130,29 +140,36 @@ class LlamaDiskCache(LlamaCache): _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value = self.cache.pop(_key) - self.cache.push(_key) + value: "LlamaState" = self.cache.pop(_key) # type: ignore + self.cache.push(_key, side="front") # type: ignore return value + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + def __setitem__(self, key: Sequence[int], value: "LlamaState"): + print("LlamaDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: + print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) del self.cache[key] self.cache[key] = value - while self.cache_size > self.capacity_bytes: + print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] + print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) class LlamaState: def __init__( - self, - eval_tokens: Deque[int], - eval_logits: Deque[List[float]], - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] - llama_state_size: int, + self, + eval_tokens: Deque[int], + eval_logits: Deque[List[float]], + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] + llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits @@ -184,25 +201,25 @@ class Llama: """High-level Python wrapper for a llama.cpp model.""" def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - verbose: bool = True, + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. + n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + verbose: bool = True, ): """Load a llama.cpp model from `model_path`. 
@@ -249,7 +266,7 @@ class Llama: self.eval_tokens: Deque[int] = deque(maxlen=n_ctx) self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) - self.cache: Optional[LlamaCache] = None + self.cache: Optional[BaseLlamaCache] = None self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -271,12 +288,12 @@ class Llama: if self.lora_path: if llama_cpp.llama_apply_lora_from_file( - self.ctx, - llama_cpp.c_char_p(self.lora_path.encode("utf-8")), - llama_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else llama_cpp.c_char_p(0), - llama_cpp.c_int(self.n_threads), + self.ctx, + llama_cpp.c_char_p(self.lora_path.encode("utf-8")), + llama_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else llama_cpp.c_char_p(0), + llama_cpp.c_int(self.n_threads), ): raise RuntimeError( f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" @@ -363,7 +380,7 @@ class Llama: ) return output - def set_cache(self, cache: Optional[LlamaCache]): + def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. Args: @@ -387,7 +404,7 @@ class Llama: assert self.ctx is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): - batch = tokens[i: min(len(tokens), i + self.n_batch)] + batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) return_code = llama_cpp.llama_eval( @@ -409,28 +426,28 @@ class Llama: n_vocab = self._n_vocab cols = n_vocab logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [logits_view[i * cols: (i + 1) * cols] for i in range(rows)] + logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) self._scores: npt.NDArray[np.single] = np.concatenate( (self._scores, np.array(logits, dtype=np.single)), axis=0 ) def _sample( - self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: llama_cpp.c_int, - top_k: llama_cpp.c_int, - top_p: llama_cpp.c_float, - temp: llama_cpp.c_float, - tfs_z: llama_cpp.c_float, - repeat_penalty: llama_cpp.c_float, - frequency_penalty: llama_cpp.c_float, - presence_penalty: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] + last_n_tokens_size: llama_cpp.c_int, + top_k: llama_cpp.c_int, + top_p: llama_cpp.c_float, + temp: llama_cpp.c_float, + tfs_z: llama_cpp.c_float, + repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, + mirostat_mode: llama_cpp.c_int, + mirostat_tau: llama_cpp.c_float, + mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -550,19 +567,19 @@ class Llama: ) def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: 
float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -578,7 +595,7 @@ class Llama: assert self.ctx is not None last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size:].tolist() + ) + self._input_ids[-self.last_n_tokens_size :].tolist() return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data @@ -599,21 +616,21 @@ class Llama: ) def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -676,7 +693,7 @@ class Llama: logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -685,7 +702,7 @@ class Llama: tokens.extend(tokens_or_none) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None ) -> Embedding: """Embed a string. 
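Note on the generate() hunks above: the generator yields sampled token ids and leaves stopping to the caller unless a stopping_criteria is supplied. A hedged usage sketch follows; the model path, prompt, and 64-token budget are illustrative assumptions, and token_eos() is assumed to be available as the high-level EOS accessor on Llama.

    import llama_cpp

    llm = llama_cpp.Llama(model_path="./models/ggml-model.bin", verbose=False)  # placeholder path

    prompt_tokens = llm.tokenize(b"Q: Name the planets in the solar system. A: ")
    completion_tokens = []
    for token in llm.generate(
        prompt_tokens, top_k=40, top_p=0.95, temp=0.80, repeat_penalty=1.1
    ):
        if token == llm.token_eos():  # assumed EOS accessor; stop at end-of-sequence
            break
        completion_tokens.append(token)
        if len(completion_tokens) >= 64:  # manual token budget; generate() never stops on its own
            break

    print(llm.detokenize(completion_tokens).decode("utf-8", errors="ignore"))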
@@ -720,8 +737,8 @@ class Llama: n_tokens = len(tokens) total_tokens += n_tokens embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + : llama_cpp.llama_n_embd(self.ctx) + ] data.append( { @@ -755,27 +772,27 @@ class Llama: return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None @@ -827,19 +844,19 @@ class Llama: finish_reason = "length" multibyte_fix = 0 for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -891,7 +908,7 @@ class Llama: token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position >= ( - remaining_length - first_stop_position - 1 + remaining_length - first_stop_position - 1 ): break logprobs_or_none: Optional[CompletionLogprobs] = None @@ -952,7 +969,7 @@ class Llama: break if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -1017,8 +1034,8 @@ class Llama: "choices": [ { "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, @@ -1049,6 +1066,7 @@ class Llama: if 
self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() + print("Llama._create_completion: cache saved", file=sys.stderr) return if self.cache: @@ -1084,10 +1102,10 @@ class Llama: for token in all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] + Llama.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs + all_tokens, all_token_strs, all_logprobs ): text_offsets.append(text_offset) text_offset += len(token_str) @@ -1138,27 +1156,27 @@ class Llama: } def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1211,27 +1229,27 @@ class Llama: return completion def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
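Note on the completion entry points whose signatures are reflowed above: a short usage sketch, with the model path and prompts as placeholders. Per the check in _create_completion, requesting logprobs would additionally require constructing Llama with logits_all=True.

    import llama_cpp

    llm = llama_cpp.Llama(model_path="./models/ggml-model.bin", verbose=False)  # placeholder path

    # Non-streaming call: returns a single OpenAI-style Completion dict.
    out = llm(
        "Q: What is the capital of France? A:",
        max_tokens=32,
        temperature=0.8,
        stop=["Q:", "\n"],
    )
    print(out["choices"][0]["text"], out["choices"][0]["finish_reason"])

    # Streaming call: returns an iterator of CompletionChunk dicts instead.
    for chunk in llm("Count to five:", max_tokens=16, stream=True):
        print(chunk["choices"][0]["text"], end="", flush=True)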
@@ -1279,7 +1297,7 @@ class Llama: ) def _convert_text_completion_to_chat( - self, completion: Completion + self, completion: Completion ) -> ChatCompletion: return { "id": "chat" + completion["id"], @@ -1300,8 +1318,8 @@ class Llama: } def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], + self, + chunks: Iterator[CompletionChunk], ) -> Iterator[ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -1337,22 +1355,22 @@ class Llama: } def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1453,9 +1471,17 @@ class Llama: def save_state(self) -> LlamaState: assert self.ctx is not None + if self.verbose: + print("Llama.save_state: saving llama state", file=sys.stderr) state_size = llama_cpp.llama_get_state_size(self.ctx) + if self.verbose: + print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) llama_state = (llama_cpp.c_uint8 * int(state_size))() + if self.verbose: + print("Llama.save_state: allocated state", file=sys.stderr) n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + if self.verbose: + print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): raise RuntimeError("Failed to copy llama state data") llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 23382e1..f70d8f0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -58,6 +58,10 @@ class Settings(BaseSettings): default=False, description="Use a cache to reduce processing times for evaluated prompts.", ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) cache_size: int = Field( default=2 << 30, description="The size of the cache in bytes. 
Only used if cache is True.", @@ -108,6 +112,11 @@ def create_app(settings: Optional[Settings] = None): verbose=settings.verbose, ) if settings.cache: + if settings.cache_type == "disk": + cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + else: + cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) From 312f6eb144aff22b3b458a0b07742d178f6f0355 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 16:46:55 -0400 Subject: [PATCH 346/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5c64a09..72ff528 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5c64a0952ee58b2d742ee84e8e3d43cce5d366db +Subproject commit 72ff5282bf0388c60821f504c4c8cc2b1f491aa6 From 202ed4464bbed08166550e733c5ae71d499a8adb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 20:33:30 -0400 Subject: [PATCH 347/443] Update gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 79093b4..fb0b6c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode/ + _skbuild/ .envrc @@ -11,6 +13,8 @@ __pycache__/ # C extensions *.so +*.dylib +*.dll # Distribution / packaging .Python From 734545677933fe36c84401e4c8b3918eb0293019 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 21:49:42 -0400 Subject: [PATCH 348/443] Migrate to scikit-build-core --- .github/workflows/build-and-release.yaml | 12 ++--- .github/workflows/build-docker.yaml | 39 ----------------- .github/workflows/publish-to-test.yaml | 30 ------------- .github/workflows/publish.yaml | 4 +- .github/workflows/test.yaml | 18 ++++---- CMakeLists.txt | 2 +- Makefile | 23 ++++++---- pyproject.toml | 56 +++++++++++++++++------- setup.py | 32 -------------- 9 files changed, 73 insertions(+), 143 deletions(-) delete mode 100644 .github/workflows/build-docker.yaml delete mode 100644 .github/workflows/publish-to-test.yaml delete mode 100644 setup.py diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 2c0ca4a..1a9c192 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -22,14 +22,15 @@ jobs: - uses: actions/setup-python@v3 - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.12.1 + run: python3 -m pip install cibuildwheel==2.12.1 - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python3 -m pip install --upgrade pip + python3 -m pip install --verbose --editable . - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + run: python3 -m cibuildwheel --output-dir wheelhouse - uses: actions/upload-artifact@v3 with: @@ -46,10 +47,11 @@ jobs: - uses: actions/setup-python@v3 - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python3 -m pip install --upgrade pip build + python3 -m pip install --verbose --editable . 
- name: Build source distribution run: | - python setup.py sdist + python3 -m build --sdist - uses: actions/upload-artifact@v3 with: path: ./dist/*.tar.gz diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml deleted file mode 100644 index 16b00a2..0000000 --- a/.github/workflows/build-docker.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Build Docker - -on: workflow_dispatch - -permissions: - contents: write - packages: write - -jobs: - docker: - name: Build and push Docker image - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - uses: docker/build-push-action@v4 - with: - context: . - push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml deleted file mode 100644 index 5a9f339..0000000 --- a/.github/workflows/publish-to-test.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ - -name: Publish to TestPyPI - -on: workflow_dispatch - -jobs: - build-n-publish: - name: Build and publish - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - with: - submodules: "true" - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools - - name: Build source distribution - run: | - python setup.py sdist - - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository-url: https://test.pypi.org/legacy/ \ No newline at end of file diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index ddefd68..9a84fea 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -19,10 +19,10 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python3 -m pip install --upgrade pip build - name: Build source distribution run: | - python setup.py sdist + python3 -m build --sdist - name: Publish distribution to PyPI # TODO: move to tag based releases # if: startsWith(github.ref, 'refs/tags') diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56524e0..8dcd3ef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,11 +26,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn - pip install . 
-v + python3 -m pip install --upgrade pip + python3 -m pip install --verbose --editable .[server,test] - name: Test with pytest run: | - pytest + python3 -m pytest build-windows: @@ -49,11 +49,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn - pip install . -v + python3 -m pip install --upgrade pip + python3 -m pip install --verbose --editable .[server,test] - name: Test with pytest run: | - pytest + python3 -m pytest build-macos: @@ -72,8 +72,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn - pip install . -v + python3 -m pip install --upgrade pip + python3 -m pip install --verbose --editable .[server,test] - name: Test with pytest run: | - pytest \ No newline at end of file + python3 -m pytest \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index e5fac6a..3255053 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ if (UNIX AND NOT FORCE_CMAKE) ) install( FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - DESTINATION llama_cpp + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp ) else() set(BUILD_SHARED_LIBS "On") diff --git a/Makefile b/Makefile index d2f38da..3443698 100644 --- a/Makefile +++ b/Makefile @@ -1,30 +1,38 @@ update: poetry install + python3 -m pip install --upgrade pip git submodule update --init --recursive update.vendor: cd vendor/llama.cpp && git pull origin master build: - python3 setup.py develop + python3 -m pip install --upgrade pip + python3 -m pip install --verbose --editable . build.cuda: - CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + python3 -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . build.opencl: - CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop + python3 -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . build.openblas: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + python3 -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . build.blis: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop + python3 -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . build.metal: - CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop + python3 -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . 
build.sdist: - python3 setup.py sdist + python3 -m pip install --upgrade pip build + python3 -m build --sdist deploy.pypi: python3 -m twine upload dist/* @@ -36,7 +44,6 @@ deploy.gh-docs: clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so - - rm -rf _skbuild - rm llama_cpp/*.so - rm llama_cpp/*.dylib - rm llama_cpp/*.dll diff --git a/pyproject.toml b/pyproject.toml index 05c9271..3017d5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,47 @@ -[tool.poetry] +[build-system] +requires = [ + "scikit-build-core>=0.4.4", + "cmake>=3.18", + "ninja", +] +build-backend = "scikit_build_core.build" + +[project] name = "llama_cpp_python" version = "0.1.59" description = "Python bindings for the llama.cpp library" -authors = ["Andrei Betlen "] -license = "MIT" readme = "README.md" -homepage = "https://github.com/abetlen/llama-cpp-python" -repository = "https://github.com/abetlen/llama-cpp-python" -packages = [{include = "llama_cpp"}] -include = [ - "LICENSE.md", +license = { text = "MIT" } +authors = [ + { name = "Andrei Betlen", email = "abetlen@gmail.com" }, ] +requires-python = ">=3.7" +dependencies = [ + "typing-extensions>=4.5.0", + "numpy>=1.20.0", + "diskcache>=5.6.1", +] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] + +[tool.scikit-build] +wheel.packages = ["llama_cpp", "llama_cpp.server"] +wheel.expand-macos-universal-tags = true +cmake.verbose = true + +[project.optional-dependencies] +server = [ + "uvicorn>=0.21.1", + "fastapi>=0.95.0", + "sse-starlette>=1.3.3", +] +test = ["pytest"] [tool.poetry.dependencies] python = "^3.8.1" @@ -33,12 +64,3 @@ scikit-build = "0.17.6" [tool.poetry.extras] server = ["uvicorn", "fastapi", "sse-starlette"] - -[build-system] -requires = [ - "setuptools>=42", - "scikit-build>=0.13", - "cmake>=3.18", - "ninja", -] -build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py deleted file mode 100644 index 20ebc95..0000000 --- a/setup.py +++ /dev/null @@ -1,32 +0,0 @@ -from skbuild import setup - -from pathlib import Path - -this_directory = Path(__file__).parent -long_description = (this_directory / "README.md").read_text(encoding="utf-8") - -setup( - name="llama_cpp_python", - description="A Python wrapper for llama.cpp", - long_description=long_description, - long_description_content_type="text/markdown", - version="0.1.59", - author="Andrei Betlen", - author_email="abetlen@gmail.com", - license="MIT", - package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, - packages=["llama_cpp", "llama_cpp.server"], - install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], - extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], - }, - python_requires=">=3.7", - classifiers=[ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - ], -) From c12138f7bd39fed54d8df14fd2333b2ad8e79971 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 21:53:38 -0400 Subject: [PATCH 349/443] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md 
index e9546b1..e875183 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- (build-system) Migrate from scikit-build to scikit-build-core + ## [v0.1.59] ### Added From 43854e6a8336bf055fd74246c2b2baa198c14a92 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 21:55:42 -0400 Subject: [PATCH 350/443] Update server dependencies --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3017d5c..86f7af6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,9 @@ cmake.verbose = true [project.optional-dependencies] server = [ - "uvicorn>=0.21.1", - "fastapi>=0.95.0", - "sse-starlette>=1.3.3", + "uvicorn>=0.22.0", + "fastapi>=0.96.0", + "sse-starlette>=1.6.1", ] test = ["pytest"] From 1d6bdf8db651a0c078321b77d4490191340c9b44 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 21:59:58 -0400 Subject: [PATCH 351/443] Update server dependencies --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 86f7af6..4fb3c8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,9 @@ cmake.verbose = true [project.optional-dependencies] server = [ - "uvicorn>=0.22.0", - "fastapi>=0.96.0", - "sse-starlette>=1.6.1", + "uvicorn", + "fastapi", + "sse-starlette", ] test = ["pytest"] From 146ca2c59f0ec1ed25626270fe869b4c714ecff6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 22:03:24 -0400 Subject: [PATCH 352/443] Add missing httpx --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4fb3c8b..2946143 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ server = [ "fastapi", "sse-starlette", ] -test = ["pytest"] +test = ["pytest", "httpx"] [tool.poetry.dependencies] python = "^3.8.1" From b025a859aea32d2bdf40758f0bc8927f33baae4e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Jun 2023 22:11:01 -0400 Subject: [PATCH 353/443] Add full path to shared library installation path --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3255053..4760a74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,9 +25,9 @@ else() add_subdirectory(vendor/llama.cpp) install( TARGETS llama - LIBRARY DESTINATION llama_cpp - RUNTIME DESTINATION llama_cpp - ARCHIVE DESTINATION llama_cpp - FRAMEWORK DESTINATION llama_cpp + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp ) endif() From 556c7edf47352036f7d876534a2b3ce4e1586a36 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 10:57:36 -0400 Subject: [PATCH 354/443] Truncate max_tokens if it exceeds context length --- llama_cpp/llama.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 05994b6..4b6ce8c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -811,9 +811,16 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) + max_tokens > self._n_ctx: + if len(prompt_tokens) > self._n_ctx: raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") + # Truncate max_tokens if 
requested tokens would exceed the context window + max_tokens = ( + max_tokens + if max_tokens + len(prompt_tokens) < self._n_ctx + else (self._n_ctx - len(prompt_tokens)) + ) + if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] else: From f2a54ecb4cdd936fc81caca80f84ce3536a47c4f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 11:01:42 -0400 Subject: [PATCH 355/443] Update CHANGELOG --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e875183..8ea8d90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +#### Addeed + - (build-system) Migrate from scikit-build to scikit-build-core +### Fixed +- Truncate max_tokens in create_completion so requested tokens doesn't exceed context size. + ## [v0.1.59] ### Added From be0403da98b7c035c1942e05852b3583463f0241 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 11:09:32 -0400 Subject: [PATCH 356/443] Add missing poetry sections to pyproject.toml --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 2946143..f10b0cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,12 @@ server = [ ] test = ["pytest", "httpx"] +[tool.poetry] +name = "llama_cpp_python" +version = "0.1.59" +description = "Python bindings for the llama.cpp library" +authors = ["Andrei Betlen "] + [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.6.3" From 0da655b3be172b7fa91d1eca09aeccfc4dc8e458 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 11:10:24 -0400 Subject: [PATCH 357/443] Temporarily disable cache until save state bug is fixed. --- CHANGELOG.md | 2 ++ llama_cpp/llama.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ea8d90..da75478 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - (build-system) Migrate from scikit-build to scikit-build-core ### Fixed + - Truncate max_tokens in create_completion so requested tokens doesn't exceed context size. 
+- Temporarily disable cache for completion requests ## [v0.1.59] diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4b6ce8c..02fe774 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -831,7 +831,9 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - if self.cache: + # Temporarily disable usage of the cache + # See: https://github.com/abetlen/llama-cpp-python/issues/348#issuecomment-1583072408 + if self.cache and False: try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( @@ -1069,14 +1071,14 @@ class Llama: } ], } - if self.cache: + if self.cache and False: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() print("Llama._create_completion: cache saved", file=sys.stderr) return - if self.cache: + if self.cache and False: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() From dd7c7bf80b4234f8484aa25ae7c62132c32970e7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 11:52:07 -0400 Subject: [PATCH 358/443] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da75478..f391f17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.60] + #### Addeed - (build-system) Migrate from scikit-build to scikit-build-core diff --git a/pyproject.toml b/pyproject.toml index f10b0cc..f59e1a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "scikit_build_core.build" [project] name = "llama_cpp_python" -version = "0.1.59" +version = "0.1.60" description = "Python bindings for the llama.cpp library" readme = "README.md" license = { text = "MIT" } @@ -45,7 +45,7 @@ test = ["pytest", "httpx"] [tool.poetry] name = "llama_cpp_python" -version = "0.1.59" +version = "0.1.60" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] From c0f7e739c98a1012342da0ff5cf6fe6209d70368 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 12:39:09 -0400 Subject: [PATCH 359/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 72ff528..98ed165 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 72ff5282bf0388c60821f504c4c8cc2b1f491aa6 +Subproject commit 98ed16557432d7a5179c57eddcc3a08a7ae6d54d From eb7645b3ba84e182a903663d68c0b4864b670f9b Mon Sep 17 00:00:00 2001 From: Tanner Hobson Date: Fri, 9 Jun 2023 13:13:08 -0400 Subject: [PATCH 360/443] Add support for logit_bias and logit_bias_type parameters --- llama_cpp/llama.py | 2 ++ llama_cpp/server/app.py | 53 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 02fe774..197511c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1380,6 +1380,7 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
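Note on the logits_processor parameter threaded through in this patch: it expects callables of the shape used by LogitsProcessorList, taking (input_ids, scores) and returning adjusted scores, as the server-side logit_bias processor in the next hunks does. A hedged sketch of a custom processor follows; the boost_token helper and the token id 1234 are illustrative, not part of the library.

    from typing import List

    import llama_cpp


    def boost_token(token_id: int, bias: float):
        # Return a processor that adds `bias` to one vocabulary entry's logit.
        def processor(input_ids: List[int], scores: List[float]) -> List[float]:
            new_scores = list(scores)
            new_scores[token_id] = new_scores[token_id] + bias
            return new_scores

        return processor


    processors = llama_cpp.LogitsProcessorList([boost_token(1234, 5.0)])

    # With a loaded model `llm`, the list is passed straight through, e.g.:
    #   llm.create_chat_completion(messages, logits_processor=processors)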
@@ -1421,6 +1422,7 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..a6194f5 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -249,13 +249,14 @@ class CreateCompletionRequest(BaseModel): ) presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -274,6 +275,39 @@ class CreateCompletionRequest(BaseModel): CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +def make_logit_bias_processor( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], +): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias: Dict[int, float] = {} + if logit_bias_type == "input_ids": + for input_id, score in logit_bias.items(): + input_id = int(input_id) + to_bias[input_id] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + token = token.encode('utf-8') + for input_id in llama.tokenize(token, add_bos=False): + to_bias[input_id] = score + + def logit_bias_processor( + input_ids: List[int], + scores: List[float], + ) -> List[float]: + new_scores = [None] * len(scores) + for input_id, score in enumerate(scores): + new_scores[input_id] = score + to_bias.get(input_id, 0.0) + + return new_scores + + return logit_bias_processor + + @router.post( "/v1/completions", response_model=CreateCompletionResponse, @@ -291,9 +325,16 @@ async def create_completion( "n", "best_of", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) @@ -372,11 +413,12 @@ class CreateChatCompletionRequest(BaseModel): stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -413,9 +455,16 @@ async def create_chat_completion( exclude = { "n", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) From a55355286887e253d596aba69ed092be8c7ce334 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 
16:52:17 -0400 Subject: [PATCH 361/443] Add project urls to pyproject --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f59e1a9..d703473 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,13 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] +[project.urls] +Homepage = "https://github.com/abetlen/llama-cpp-python" +Documentation = "https://abetlen.github.io/llama-cpp-python" +Discussions = "https://github.com/abetlen/llama-cpp-python/discussions" +Issues = "https://github.com/abetlen/llama-cpp-python/issues" +Changelog = "https://github.com/abetlen/llama-cpp-python/blob/main/CHANGELOG.md" + [tool.scikit-build] wheel.packages = ["llama_cpp", "llama_cpp.server"] wheel.expand-macos-universal-tags = true From 2fdd873125e907ff16470899dc0c925c839d562d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 16:52:40 -0400 Subject: [PATCH 362/443] Add gihub action to test published pypi version of package --- .github/workflows/test-pypi.yaml | 70 ++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/test-pypi.yaml diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml new file mode 100644 index 0000000..5de1837 --- /dev/null +++ b/.github/workflows/test-pypi.yaml @@ -0,0 +1,70 @@ +name: Tests for PyPI package + +on: + pull_request: + branches: + - main + push: + branches: + - main + +jobs: + build-linux: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -m pytest + + build-windows: + + runs-on: windows-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -m pytest + + build-macos: + + runs-on: macos-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -m pytest \ No newline at end of file From d4aed351e3a2b1a5ea303dae2b0d5853cfe30e96 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 17:08:42 -0400 Subject: [PATCH 363/443] Run on workflow_dispatch --- .github/workflows/test-pypi.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml index 5de1837..542d338 100644 --- a/.github/workflows/test-pypi.yaml +++ b/.github/workflows/test-pypi.yaml @@ -1,12 +1,6 @@ name: Tests for PyPI package -on: - pull_request: - branches: - - main - push: - branches: - - main +on: workflow_dispatch jobs: build-linux: From 
3c6e1b6c42f3f6ad8af11142950a29f13d17db48 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 19:08:15 -0400 Subject: [PATCH 364/443] Update to smoketest --- .github/workflows/test-pypi.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml index 542d338..38f2d92 100644 --- a/.github/workflows/test-pypi.yaml +++ b/.github/workflows/test-pypi.yaml @@ -21,7 +21,7 @@ jobs: python3 -m pip install --verbose llama-cpp-python[server,test] - name: Test with pytest run: | - python3 -m pytest + python3 -c "import llama_cpp" build-windows: @@ -41,7 +41,7 @@ jobs: python3 -m pip install --verbose llama-cpp-python[server,test] - name: Test with pytest run: | - python3 -m pytest + python3 -c "import llama_cpp" build-macos: @@ -61,4 +61,4 @@ jobs: python3 -m pip install --verbose llama-cpp-python[server,test] - name: Test with pytest run: | - python3 -m pytest \ No newline at end of file + python3 -c "import llama_cpp" \ No newline at end of file From e3542b6627db9b01fb01a3fa605c686307129f4e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 23:23:16 -0400 Subject: [PATCH 365/443] Revert "Merge pull request #350 from abetlen/migrate-to-scikit-build-core" This reverts commit fb2c5f7fd94e6e7abc714e910a3ee29f824746d0, reversing changes made to 202ed4464bbed08166550e733c5ae71d499a8adb. --- .github/workflows/build-and-release.yaml | 12 ++--- .github/workflows/build-docker.yaml | 39 +++++++++++++++ .github/workflows/publish-to-test.yaml | 30 +++++++++++ .github/workflows/publish.yaml | 4 +- .github/workflows/test.yaml | 18 +++---- CHANGELOG.md | 4 +- CMakeLists.txt | 10 ++-- Makefile | 23 +++------ pyproject.toml | 63 +++++++----------------- setup.py | 32 ++++++++++++ 10 files changed, 149 insertions(+), 86 deletions(-) create mode 100644 .github/workflows/build-docker.yaml create mode 100644 .github/workflows/publish-to-test.yaml create mode 100644 setup.py diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 1a9c192..2c0ca4a 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -22,15 +22,14 @@ jobs: - uses: actions/setup-python@v3 - name: Install cibuildwheel - run: python3 -m pip install cibuildwheel==2.12.1 + run: python -m pip install cibuildwheel==2.12.1 - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose --editable . + python -m pip install --upgrade pip pytest cmake scikit-build setuptools - name: Build wheels - run: python3 -m cibuildwheel --output-dir wheelhouse + run: python -m cibuildwheel --output-dir wheelhouse - uses: actions/upload-artifact@v3 with: @@ -47,11 +46,10 @@ jobs: - uses: actions/setup-python@v3 - name: Install dependencies run: | - python3 -m pip install --upgrade pip build - python3 -m pip install --verbose --editable . 
+ python -m pip install --upgrade pip pytest cmake scikit-build setuptools - name: Build source distribution run: | - python3 -m build --sdist + python setup.py sdist - uses: actions/upload-artifact@v3 with: path: ./dist/*.tar.gz diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml new file mode 100644 index 0000000..16b00a2 --- /dev/null +++ b/.github/workflows/build-docker.yaml @@ -0,0 +1,39 @@ +name: Build Docker + +on: workflow_dispatch + +permissions: + contents: write + packages: write + +jobs: + docker: + name: Build and push Docker image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + context: . + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml new file mode 100644 index 0000000..5a9f339 --- /dev/null +++ b/.github/workflows/publish-to-test.yaml @@ -0,0 +1,30 @@ +# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + +name: Publish to TestPyPI + +on: workflow_dispatch + +jobs: + build-n-publish: + name: Build and publish + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + submodules: "true" + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build source distribution + run: | + python setup.py sdist + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository-url: https://test.pypi.org/legacy/ \ No newline at end of file diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 9a84fea..ddefd68 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -19,10 +19,10 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - python3 -m pip install --upgrade pip build + python -m pip install --upgrade pip pytest cmake scikit-build setuptools - name: Build source distribution run: | - python3 -m build --sdist + python setup.py sdist - name: Publish distribution to PyPI # TODO: move to tag based releases # if: startsWith(github.ref, 'refs/tags') diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8dcd3ef..56524e0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,11 +26,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose --editable .[server,test] + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + pip install . 
-v - name: Test with pytest run: | - python3 -m pytest + pytest build-windows: @@ -49,11 +49,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose --editable .[server,test] + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + pip install . -v - name: Test with pytest run: | - python3 -m pytest + pytest build-macos: @@ -72,8 +72,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose --editable .[server,test] + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + pip install . -v - name: Test with pytest run: | - python3 -m pytest \ No newline at end of file + pytest \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index f391f17..fc1022c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.1.60] -#### Addeed +### NOTE -- (build-system) Migrate from scikit-build to scikit-build-core +- This release was deleted due to a bug with the packaging system that caused pip installations to fail. ### Fixed diff --git a/CMakeLists.txt b/CMakeLists.txt index 4760a74..e5fac6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,16 +18,16 @@ if (UNIX AND NOT FORCE_CMAKE) ) install( FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + DESTINATION llama_cpp ) else() set(BUILD_SHARED_LIBS "On") add_subdirectory(vendor/llama.cpp) install( TARGETS llama - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + LIBRARY DESTINATION llama_cpp + RUNTIME DESTINATION llama_cpp + ARCHIVE DESTINATION llama_cpp + FRAMEWORK DESTINATION llama_cpp ) endif() diff --git a/Makefile b/Makefile index 3443698..d2f38da 100644 --- a/Makefile +++ b/Makefile @@ -1,38 +1,30 @@ update: poetry install - python3 -m pip install --upgrade pip git submodule update --init --recursive update.vendor: cd vendor/llama.cpp && git pull origin master build: - python3 -m pip install --upgrade pip - python3 -m pip install --verbose --editable . + python3 setup.py develop build.cuda: - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . + CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop build.opencl: - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . + CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop build.openblas: - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . + CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop build.blis: - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . 
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop build.metal: - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 -m pip install --verbose --editable . + CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop build.sdist: - python3 -m pip install --upgrade pip build - python3 -m build --sdist + python3 setup.py sdist deploy.pypi: python3 -m twine upload dist/* @@ -44,6 +36,7 @@ deploy.gh-docs: clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so + - rm -rf _skbuild - rm llama_cpp/*.so - rm llama_cpp/*.dylib - rm llama_cpp/*.dll diff --git a/pyproject.toml b/pyproject.toml index d703473..12626e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,54 +1,16 @@ -[build-system] -requires = [ - "scikit-build-core>=0.4.4", - "cmake>=3.18", - "ninja", -] -build-backend = "scikit_build_core.build" - -[project] +[tool.poetry] name = "llama_cpp_python" version = "0.1.60" description = "Python bindings for the llama.cpp library" +authors = ["Andrei Betlen "] +license = "MIT" readme = "README.md" -license = { text = "MIT" } -authors = [ - { name = "Andrei Betlen", email = "abetlen@gmail.com" }, +homepage = "https://github.com/abetlen/llama-cpp-python" +repository = "https://github.com/abetlen/llama-cpp-python" +packages = [{include = "llama_cpp"}] +include = [ + "LICENSE.md", ] -requires-python = ">=3.7" -dependencies = [ - "typing-extensions>=4.5.0", - "numpy>=1.20.0", - "diskcache>=5.6.1", -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", -] - -[project.urls] -Homepage = "https://github.com/abetlen/llama-cpp-python" -Documentation = "https://abetlen.github.io/llama-cpp-python" -Discussions = "https://github.com/abetlen/llama-cpp-python/discussions" -Issues = "https://github.com/abetlen/llama-cpp-python/issues" -Changelog = "https://github.com/abetlen/llama-cpp-python/blob/main/CHANGELOG.md" - -[tool.scikit-build] -wheel.packages = ["llama_cpp", "llama_cpp.server"] -wheel.expand-macos-universal-tags = true -cmake.verbose = true - -[project.optional-dependencies] -server = [ - "uvicorn", - "fastapi", - "sse-starlette", -] -test = ["pytest", "httpx"] [tool.poetry] name = "llama_cpp_python" @@ -77,3 +39,12 @@ scikit-build = "0.17.6" [tool.poetry.extras] server = ["uvicorn", "fastapi", "sse-starlette"] + +[build-system] +requires = [ + "setuptools>=42", + "scikit-build>=0.13", + "cmake>=3.18", + "ninja", +] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..20ebc95 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +from skbuild import setup + +from pathlib import Path + +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text(encoding="utf-8") + +setup( + name="llama_cpp_python", + description="A Python wrapper for llama.cpp", + long_description=long_description, + long_description_content_type="text/markdown", + version="0.1.59", + author="Andrei Betlen", + author_email="abetlen@gmail.com", + license="MIT", + package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, + packages=["llama_cpp", "llama_cpp.server"], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], + extras_require={ + 
"server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + }, + python_requires=">=3.7", + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], +) From 6b764cab80168831ec21b30b7bac6f2fa11dace2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Jun 2023 23:25:38 -0400 Subject: [PATCH 366/443] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 8 +------- setup.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1022c..427db03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.61] + +### Fixed + +- Fix broken pip installation + ## [0.1.60] ### NOTE diff --git a/pyproject.toml b/pyproject.toml index 12626e0..f631ec7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.60" +version = "0.1.61" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -12,12 +12,6 @@ include = [ "LICENSE.md", ] -[tool.poetry] -name = "llama_cpp_python" -version = "0.1.60" -description = "Python bindings for the llama.cpp library" -authors = ["Andrei Betlen "] - [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.6.3" diff --git a/setup.py b/setup.py index 20ebc95..e0f1271 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.59", + version="0.1.61", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 6639371407e636d91eadcef40b182d89c1225b6e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 12:17:38 -0400 Subject: [PATCH 367/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 44 ++++++++++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index bb9b0e5..29136c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -234,6 +234,22 @@ LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) +# // model quantization parameters +# typedef struct llama_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } llama_model_quantize_params; +class llama_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + # LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -243,6 +259,15 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); +def llama_model_quantize_default_params() -> 
llama_model_quantize_params: + return _lib.llama_model_quantize_default_params() + + +_lib.llama_model_quantize_default_params.argtypes = [] +_lib.llama_model_quantize_default_params.restype = llama_model_quantize_params + + # LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -308,21 +333,24 @@ _lib.llama_free.argtypes = [llama_context_p] _lib.llama_free.restype = None -# TODO: not great API - very likely to change -# Returns 0 on success -# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# // Returns 0 on success # LLAMA_API int llama_model_quantize( # const char * fname_inp, # const char * fname_out, -# enum llama_ftype ftype, -# int nthread); +# const llama_model_quantize_params * params); def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(llama_model_quantize_params) # type: ignore ) -> int: - return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) + return _lib.llama_model_quantize(fname_inp, fname_out, params) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(llama_model_quantize_params), +] _lib.llama_model_quantize.restype = c_int diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 98ed165..303f580 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 98ed16557432d7a5179c57eddcc3a08a7ae6d54d +Subproject commit 303f5809f1b4ec49823dbe70cacd2124ec1d0df0 From 21acd7901fd43e8c3782f49851b418048d74deca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 12:22:31 -0400 Subject: [PATCH 368/443] Re-enable cache --- llama_cpp/llama.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 02fe774..4b6ce8c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -831,9 +831,7 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - # Temporarily disable usage of the cache - # See: https://github.com/abetlen/llama-cpp-python/issues/348#issuecomment-1583072408 - if self.cache and False: + if self.cache: try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( @@ -1071,14 +1069,14 @@ class Llama: } ], } - if self.cache and False: + if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() print("Llama._create_completion: cache saved", file=sys.stderr) return - if self.cache and False: + if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() From bf2bfec615266b1a2f938ff25ed968926b27b2c3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 12:22:39 -0400 Subject: [PATCH 369/443] Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 427db03..cdda72f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Metal support working +- Cache re-enabled + ## [0.1.61] ### Fixed From 890ae442b9c7160c02f1a7b3cce21db909eb1b3e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 
2023 18:10:01 -0400 Subject: [PATCH 370/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 303f580..4de0334 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 303f5809f1b4ec49823dbe70cacd2124ec1d0df0 +Subproject commit 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2 From c1eaef329a70db994057cae9982b841b547fed72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 18:11:48 -0400 Subject: [PATCH 371/443] Add resource destination to cmake --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5fac6a..788402a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,5 +29,6 @@ else() RUNTIME DESTINATION llama_cpp ARCHIVE DESTINATION llama_cpp FRAMEWORK DESTINATION llama_cpp + RESOURCE DESTINATION llama_cpp ) endif() From 6e302c6ee841259d2266c77c35de54fe0b68a79f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 18:17:34 -0400 Subject: [PATCH 372/443] Update makefile and gitignore --- .gitignore | 2 ++ Makefile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index fb0b6c9..36ed7f7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,9 @@ __pycache__/ # C extensions *.so *.dylib +*.metal *.dll +*.lib # Distribution / packaging .Python diff --git a/Makefile b/Makefile index d2f38da..66d93f3 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,9 @@ clean: - rm -rf _skbuild - rm llama_cpp/*.so - rm llama_cpp/*.dylib + - rm llama_cpp/*.metal - rm llama_cpp/*.dll + - rm llama_cpp/*.lib .PHONY: \ update \ From 74fbaae157766add8d23255c4c35528c23875cec Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 10 Jun 2023 18:19:48 -0400 Subject: [PATCH 373/443] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdda72f..bf6ed5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.62] + ### Fixed - Metal support working diff --git a/pyproject.toml b/pyproject.toml index f631ec7..564059c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.61" +version = "0.1.62" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index e0f1271..bb423d8 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.61", + version="0.1.62", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 4eb245afd873efacd9df24001135c28e4ab75ed8 Mon Sep 17 00:00:00 2001 From: Matt Hoffner Date: Sat, 10 Jun 2023 15:59:26 -0700 Subject: [PATCH 374/443] Update README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7487345..ee6e540 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ bash Miniforge3-MacOSX-arm64.sh ``` Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. 
-### Installation with OpenBLAS / cuBLAS / CLBlast +### Installation with OpenBLAS / cuBLAS / CLBlast / Metal `llama.cpp` supports multiple BLAS backends for faster processing. Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. @@ -64,6 +64,11 @@ To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before i CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` +To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` ## High-level API From 3ea31930e57a45a0806488950e841efbb575369a Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 00:58:08 +0100 Subject: [PATCH 375/443] fixes abetlen/llama-cpp-python #358 --- llama_cpp/server/__main__.py | 2 +- llama_cpp/server/app.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fe1d94..1de4548 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + app, host=settings.host, port=settings.port ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..2191005 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -72,6 +72,12 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) + host: str = Field( + default="localhost", description="Listen address" + ) + port: int = Field( + default=8000, description="Listen port" + ) router = APIRouter() From 3129a0e7e581f6edd29a497a13ab014687867134 Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 01:11:24 +0100 Subject: [PATCH 376/443] correction to add back environment variable support <3 docker --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 1de4548..748a2af 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=settings.host, port=settings.port + app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) ) From efcf380490af7007389df698ddfe1b0f755e7069 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Jun 2023 21:03:40 +0000 Subject: [PATCH 377/443] Bump fastapi from 0.96.0 to 0.97.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.96.0 to 0.97.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.96.0...0.97.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 112 ++++++++++--------------------------------------- pyproject.toml | 2 +- 2 files changed, 23 insertions(+), 91 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4a9c572..1d95d76 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "anyio" version = "3.6.2" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -25,7 +24,6 @@ trio = ["trio (>=0.16,<0.22)"] name = "black" version = "23.3.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -75,7 +73,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bleach" version = "6.0.0" description = "An easy safelist-based HTML-sanitizing tool." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -94,7 +91,6 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -106,7 +102,6 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "dev" optional = false python-versions = "*" files = [ @@ -183,7 +178,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -268,7 +262,6 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -283,7 +276,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -295,7 +287,6 @@ files = [ name = "cryptography" version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -333,11 +324,21 @@ test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-co test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "diskcache" +version = "5.6.1" +description = "Disk Cache -- Disk and file backed persistent cache." 
+optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, + {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, +] + [[package]] name = "distro" version = "1.8.0" description = "Distro - an OS platform information API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -349,7 +350,6 @@ files = [ name = "docutils" version = "0.20" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -361,7 +361,6 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -374,31 +373,26 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.96.0" +version = "0.97.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" -category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.96.0-py3-none-any.whl", hash = "sha256:b8e11fe81e81eab4e1504209917338e0b80f783878a42c2b99467e5e1019a1e9"}, - {file = "fastapi-0.96.0.tar.gz", hash = "sha256:71232d47c2787446991c81c41c249f8a16238d52d779c0e6b43927d3773dbe3c"}, + {file = "fastapi-0.97.0-py3-none-any.whl", hash = "sha256:95d757511c596409930bd20673358d4a4d709004edb85c5d24d6ffc48fabcbf2"}, + {file = "fastapi-0.97.0.tar.gz", hash = "sha256:b53248ee45f64f19bb7600953696e3edf94b0f7de94df1e5433fc5c6136fa986"}, ] [package.dependencies] -pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" starlette = ">=0.27.0,<0.28.0" [package.extras] all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] -dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>=0.12.0,<0.21.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"] -test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff (==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"] [[package]] name = "ghp-import" version = "2.1.0" description = "Copy your docs directly to the gh-pages branch." -category = "dev" optional = false python-versions = "*" files = [ @@ -416,7 +410,6 @@ dev = ["flake8", "markdown", "twine", "wheel"] name = "griffe" version = "0.27.3" description = "Signatures for entire Python programs. 
Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -431,7 +424,6 @@ colorama = ">=0.4" name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -443,7 +435,6 @@ files = [ name = "httpcore" version = "0.17.0" description = "A minimal low-level HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -455,17 +446,16 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" +sniffio = "==1.*" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -481,15 +471,14 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -501,7 +490,6 @@ files = [ name = "importlib-metadata" version = "6.6.0" description = "Read metadata from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -521,7 +509,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -540,7 +527,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -552,7 +538,6 @@ files = [ name = "jaraco-classes" version = "3.2.3" description = "Utility functions for Python class constructs" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -571,7 +556,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -587,7 +571,6 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -605,7 +588,6 @@ i18n = ["Babel (>=2.7)"] name = "keyring" version = "23.13.1" description = "Store and access your passwords safely." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -630,7 +612,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "markdown" version = "3.3.7" description = "Python implementation of Markdown." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -648,7 +629,6 @@ testing = ["coverage", "pyyaml"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -673,7 +653,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.2" description = "Safely add untrusted strings to HTML/XML markup." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -733,7 +712,6 @@ files = [ name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -745,7 +723,6 @@ files = [ name = "mergedeep" version = "1.3.4" description = "A deep merge function for 🐍." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -757,7 +734,6 @@ files = [ name = "mkdocs" version = "1.4.3" description = "Project documentation with Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -786,7 +762,6 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp name = "mkdocs-autorefs" version = "0.4.1" description = "Automatically link across pages in MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -802,7 +777,6 @@ mkdocs = ">=1.1" name = "mkdocs-material" version = "9.1.15" description = "Documentation that simply works" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -825,7 +799,6 @@ requests = ">=2.26" name = "mkdocs-material-extensions" version = "1.1.1" description = "Extension pack for Python Markdown and MkDocs Material." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -837,7 +810,6 @@ files = [ name = "mkdocstrings" version = "0.22.0" description = "Automatic documentation from sources, for MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -865,7 +837,6 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] name = "mkdocstrings-python" version = "0.10.1" description = "A Python handler for mkdocstrings." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -881,7 +852,6 @@ mkdocstrings = ">=0.20" name = "more-itertools" version = "9.1.0" description = "More routines for operating on iterables, beyond itertools" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -893,7 +863,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -905,7 +874,6 @@ files = [ name = "numpy" version = "1.24.3" description = "Fundamental package for array computing in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -943,7 +911,6 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -955,7 +922,6 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -967,7 +933,6 @@ files = [ name = "pkginfo" version = "1.9.6" description = "Query metadata from sdists / bdists / installed packages." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -982,7 +947,6 @@ testing = ["pytest", "pytest-cov"] name = "platformdirs" version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -998,7 +962,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1014,7 +977,6 @@ testing = ["pytest", "pytest-benchmark"] name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1026,7 +988,6 @@ files = [ name = "pydantic" version = "1.10.7" description = "Data validation and settings management using python type hints" -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1079,7 +1040,6 @@ email = ["email-validator (>=1.0.3)"] name = "pygments" version = "2.15.1" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1094,7 +1054,6 @@ plugins = ["importlib-metadata"] name = "pymdown-extensions" version = "9.11" description = "Extension pack for Python Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1110,7 +1069,6 @@ pyyaml = "*" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1133,7 +1091,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1148,7 +1105,6 @@ six = ">=1.5" name = "pywin32-ctypes" version = "0.2.0" description = "" -category = "dev" optional = false python-versions = "*" files = [ @@ -1160,7 +1116,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1210,7 +1165,6 @@ files = [ name = "pyyaml-env-tag" version = "0.1" description = "A custom YAML tag for referencing environment variables in YAML files. " -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1225,7 +1179,6 @@ pyyaml = "*" name = "readme-renderer" version = "37.3" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1245,7 +1198,6 @@ md = ["cmarkgfm (>=0.8.0)"] name = "regex" version = "2023.5.5" description = "Alternative regular expression module, to replace re." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1343,7 +1295,6 @@ files = [ name = "requests" version = "2.30.0" description = "Python HTTP for Humans." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1365,7 +1316,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-toolbelt" version = "1.0.0" description = "A utility belt for advanced users of python-requests" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1380,7 +1330,6 @@ requests = ">=2.0.1,<3.0.0" name = "rfc3986" version = "2.0.0" description = "Validating URI References per RFC 3986" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1395,7 +1344,6 @@ idna2008 = ["idna"] name = "rich" version = "13.3.5" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -1415,7 +1363,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] name = "scikit-build" version = "0.17.6" description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1440,7 +1387,6 @@ test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6 name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1456,7 +1402,6 @@ jeepney = ">=0.6" name = "setuptools" version = "67.7.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1473,7 +1418,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1485,7 +1429,6 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1497,7 +1440,6 @@ files = [ name = "sse-starlette" version = "1.6.1" description = "\"SSE plugin for Starlette\"" -category = "main" optional = true python-versions = ">=3.8" files = [ @@ -1512,7 +1454,6 @@ starlette = "*" name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1531,7 +1472,6 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1543,7 +1483,6 @@ files = [ name = "twine" version = "4.0.2" description = "Collection of utilities for publishing packages on PyPI" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1566,7 +1505,6 @@ urllib3 = ">=1.26.0" name = "typing-extensions" version = "4.6.3" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1578,7 +1516,6 @@ files = [ name = "urllib3" version = "2.0.2" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1596,7 +1533,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "uvicorn" version = "0.22.0" description = "The lightning-fast ASGI server." 
-category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1615,7 +1551,6 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "watchdog" version = "3.0.0" description = "Filesystem events monitoring" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1655,7 +1590,6 @@ watchmedo = ["PyYAML (>=3.10)"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" -category = "dev" optional = false python-versions = "*" files = [ @@ -1667,7 +1601,6 @@ files = [ name = "wheel" version = "0.40.0" description = "A built-package format for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1682,7 +1615,6 @@ test = ["pytest (>=6.0.0)"] name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1695,9 +1627,9 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "5c3354c253bc7ab7c7577a9a3733c7a341e91176e1d0c13dc2e3f3dcc0971bbe" +content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" diff --git a/pyproject.toml b/pyproject.toml index 564059c..f2dd4b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.3" numpy = "^1.20.0" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.96.0", optional = true } +fastapi = { version = "^0.97.0", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From 94f63a66b9f6f24f3e0079efa0f98c5872ef3a82 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:49:19 +1000 Subject: [PATCH 378/443] Create macos_install.md add MacOS Metal markdown install instructions --- docs/macos_install.md | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/macos_install.md diff --git a/docs/macos_install.md b/docs/macos_install.md new file mode 100644 index 0000000..7d46bc4 --- /dev/null +++ b/docs/macos_install.md @@ -0,0 +1,62 @@ + +# llama-cpp-python - MacOS Install with Metal GPU + + +**(1) Make sure you have xcode installed... at least the command line parts** +``` +# check the path of your xcode install +xcode-select -p + +# xcode installed returns +# /Applications/Xcode-beta.app/Contents/Developer + +# if xcode is missing then install it... it takes ages; +xcode-select --install +``` + +**(2) Install the conda version for MacOS that supports Metal GPU** +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` + +**(3) Make a conda environment** +``` +conda create -n llama python=3.9.16 +conda activate llama +``` + +**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** + *(you needed xcode installed in order pip to build/compile the C++ code)* +``` +pip uninstall llama-cpp-python -y +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir +pip install 'llama-cpp-python[server]' + +# you should now have llama-cpp-python v0.1.62 installed +llama-cpp-python         0.1.62      + +``` + +**(4) Download a v3 ggml llama/vicuna/alpaca model** + - **ggmlv3** + - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 + +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin +https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin + + +**(6) run the llama-cpp-python API server with MacOS Metal GPU support** +``` +# config your ggml model path +# make sure it is ggml v3 +# make sure it is q4_0 +export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 +``` + +***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* + + From 7ca50a3e45a89fda886a3f8179b7a70fc2bda197 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:52:22 +1000 Subject: [PATCH 379/443] Update README.md add link to main README>md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ee6e540..a4ca04d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,9 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): From 613dd70c8a9e54c373428055102283fdd468f09b Mon Sep 17 00:00:00 2001 From: Matt Dennewitz Date: Tue, 13 Jun 2023 00:56:05 -0500 Subject: [PATCH 380/443] Update README.md Fixes typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6e540..c099cbf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from PyPI (requires a c compiler): pip install llama-cpp-python ``` -The above command will attempt to install the package and build build `llama.cpp` from source. +The above command will attempt to install the package and build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: From 10b0cb727b249050e16edc8c14c1526cbea0e500 Mon Sep 17 00:00:00 2001 From: Okabintaro <103938900+Okabintaro@users.noreply.github.com> Date: Tue, 13 Jun 2023 12:03:31 +0200 Subject: [PATCH 381/443] fix: Make LLamaState pickable for disk cache I fixed the issue by making the saved state a bytes object instead of the ctypes one which can't be pickled. 
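
The core of the change is a plain bytes round-trip for the saved state. As a rough standalone sketch of that pattern (the buffer size and contents below are made up, and only the standard library is used), it looks roughly like this:

```python
import ctypes
import pickle

# Stand-in for the opaque state buffer obtained from the C API.
n_bytes = 8
StateArrayType = ctypes.c_uint8 * n_bytes
state_array = StateArrayType(*range(n_bytes))

# Keep a plain bytes copy in the cached object so it survives pickling,
# e.g. when a disk-backed cache serializes it.
saved_state = bytes(state_array)
restored_bytes = pickle.loads(pickle.dumps(saved_state))

# Rebuild a ctypes array of the original size only when handing the
# state back to the C API.
restored_array = StateArrayType.from_buffer_copy(restored_bytes)
assert bytes(restored_array) == saved_state
```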
--- llama_cpp/llama.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4b6ce8c..0c3d72b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -141,7 +141,9 @@ class LlamaDiskCache(BaseLlamaCache): if _key is None: raise KeyError("Key not found") value: "LlamaState" = self.cache.pop(_key) # type: ignore - self.cache.push(_key, side="front") # type: ignore + # NOTE: This puts an integer as key in cache, which breaks, + # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # self.cache.push(_key, side="front") # type: ignore return value def __contains__(self, key: Sequence[int]) -> bool: @@ -168,7 +170,7 @@ class LlamaState: eval_logits: Deque[List[float]], input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], - llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] + llama_state: bytes, llama_state_size: int, ): self.eval_tokens = eval_tokens @@ -1503,7 +1505,7 @@ class Llama: eval_logits=self.eval_logits.copy(), scores=self._scores.copy(), input_ids=self._input_ids.copy(), - llama_state=llama_state_compact, + llama_state=bytes(llama_state_compact), llama_state_size=n_bytes, ) @@ -1514,7 +1516,10 @@ class Llama: self._scores = state.scores.copy() self._input_ids = state.input_ids.copy() state_size = state.llama_state_size - if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: + LLamaStateArrayType = (llama_cpp.c_uint8 * state_size) + llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) + + if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: raise RuntimeError("Failed to set llama state data") def n_ctx(self) -> int: From fe41cb9043e4ca54e7a0989baae68eb5b730a0b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:07:50 +0000 Subject: [PATCH 382/443] Bump pytest from 7.3.1 to 7.3.2 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.1 to 7.3.2. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.3.1...7.3.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1d95d76..e720acc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1067,13 +1067,13 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.3.1" +version = "7.3.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, + {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, ] [package.dependencies] @@ -1085,7 +1085,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "python-dateutil" @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" +content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" diff --git a/pyproject.toml b/pyproject.toml index f2dd4b7..9d1be84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.15" -pytest = "^7.3.1" +pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From 715f98c591e9249acc051e73b9757666e656ab57 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:40:13 -0400 Subject: [PATCH 383/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 22 ++++++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 29136c7..be5e9c3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -155,6 +155,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs +# bool low_vram; // if true, reduce VRAM usage at the cost of performance # int seed; // RNG seed, -1 for random # bool f16_kv; // use fp16 for KV cache @@ -177,6 +178,7 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("low_vram", c_bool), ("seed", c_int), ("f16_kv", c_bool), ( @@ -555,6 +557,26 @@ _lib.llama_n_embd.argtypes = [llama_context_p] _lib.llama_n_embd.restype = c_int +# // Get the vocabulary as output parameters. +# // Returns number of results. 
+# LLAMA_API int llama_get_vocab( +# const struct llama_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab( + ctx: llama_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab(ctx, strings, scores, capacity) + + +_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] +_lib.llama_get_vocab.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4de0334..254a7a7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2 +Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 From f27393ab7ed06c769aba414dcaf2d544ab0c4c35 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:46:48 -0400 Subject: [PATCH 384/443] Add additional verbose logs for cache --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2191005..e248472 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -119,8 +119,12 @@ def create_app(settings: Optional[Settings] = None): ) if settings.cache: if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) From f7c5cfaf503eb251202f609dbbc8b5b337771de5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:08:28 -0400 Subject: [PATCH 385/443] Format server options --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 999d1e6..0d011f0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -260,18 +260,18 @@ class CreateCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + logprobs: Optional[int] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { @@ -424,7 +424,6 @@ class CreateChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field @@ -434,6 +433,7 @@ class CreateChatCompletionRequest(BaseModel): # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = 
repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { From 44b83cada5a9183d42a42670252b97b2ea7b37f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:12:33 -0400 Subject: [PATCH 386/443] Add low_vram parameter --- llama_cpp/llama.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 46a9aeb..a6f1e76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -219,6 +219,7 @@ class Llama: last_n_tokens_size: int = 64, lora_base: Optional[str] = None, lora_path: Optional[str] = None, + low_vram: bool = False, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -260,6 +261,7 @@ class Llama: self.params.use_mmap = use_mmap if lora_path is None else False self.params.use_mlock = use_mlock self.params.embedding = embedding + self.params.low_vram = low_vram self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1447,6 +1449,7 @@ class Llama: use_mmap=self.params.use_mmap, use_mlock=self.params.use_mlock, embedding=self.params.embedding, + low_vram=self.params.low_vram, last_n_tokens_size=self.last_n_tokens_size, n_batch=self.n_batch, n_threads=self.n_threads, @@ -1470,6 +1473,7 @@ class Llama: use_mmap=state["use_mmap"], use_mlock=state["use_mlock"], embedding=state["embedding"], + low_vram=state["low_vram"], n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], From 1e20be6d0c0ada75bbd30ae855d17569dd346b8f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:13:42 -0400 Subject: [PATCH 387/443] Add low_vram to server settings --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d011f0..313e27d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,10 @@ class Settings(BaseSettings): description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") + low_vram: bool = Field( + default=False, + description="Whether to use less VRAM. This will reduce performance.", + ) last_n_tokens_size: int = Field( default=64, ge=0, From 54e2e4ffde8eac57ca3f0ad117b878837d7c3d1f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:22 -0400 Subject: [PATCH 388/443] Move metal docs to metal section of README. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c29202c..0e62f3d 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,6 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) - ## Installation from PyPI (recommended) @@ -73,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + ## High-level API The high-level API provides a simple managed interface through the `Llama` class. 
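
For reference alongside that README excerpt, a minimal usage sketch of the high-level API mentioned there could look like the following; the model path and prompt are placeholders, and the returned dict is assumed to follow the OpenAI-style completion layout the project documents:

```python
from llama_cpp import Llama

# Model path is a placeholder; point it at a local ggml model file.
llm = Llama(model_path="./models/7B/ggml-model.bin")

# The Llama object is callable and returns an OpenAI-style completion dict.
output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)

print(output["choices"][0]["text"])
```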
From d938e5900369d4af2dfe86e1f51cd402cb58c87c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:44 -0400 Subject: [PATCH 389/443] Bump version --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf6ed5d..7a01f6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Add full gpu utilisation in CUDA +- (llama.cpp) Add get_vocab +- (llama.cpp) Add low_vram parameter +- (server) Add logit_bias parameter + ## [0.1.62] ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 9d1be84..281e1bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.62" +version = "0.1.63" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bb423d8..0449149 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.62", + version="0.1.63", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From fd9f294b3a5194844f50d3b77cc71c51b8ffbb52 Mon Sep 17 00:00:00 2001 From: imaprogrammer <46126206+nb-programmer@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:11:57 +0530 Subject: [PATCH 390/443] Update llama.py: Added how many input tokens in ValueError exception --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a6f1e76..366f050 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -814,7 +814,7 @@ class Llama: llama_cpp.llama_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: - raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") + raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}") # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( From 37d5192a92a9e3a861027af03dab7a792436fad7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 10:41:51 -0400 Subject: [PATCH 391/443] Update docs --- docs/macos_install.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/macos_install.md b/docs/macos_install.md index 7d46bc4..33dcb5d 100644 --- a/docs/macos_install.md +++ b/docs/macos_install.md @@ -38,14 +38,11 @@ llama-cpp-python         0.1.62      ``` -**(4) Download a v3 ggml llama/vicuna/alpaca model** +**(4) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin -https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin -https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML **(6) run the llama-cpp-python API server with MacOS Metal GPU support** From d7153abcf820b6ad39192857a1be8b806595990d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 23:11:14 -0400 Subject: [PATCH 392/443] Update llama.cpp --- 
llama_cpp/llama_cpp.py | 6 +++--- vendor/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index be5e9c3..d6be0ea 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -618,7 +618,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -# LLAMA_API llama_token llama_token_bos(); +# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -627,7 +627,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -# LLAMA_API llama_token llama_token_eos(); +# LLAMA_API llama_token llama_token_eos(); // end-of-sentence def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -636,7 +636,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# LLAMA_API llama_token llama_token_nl(); +# LLAMA_API llama_token llama_token_nl(); // next-line def llama_token_nl() -> int: return _lib.llama_token_nl() diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 254a7a7..d411968 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 +Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd From 60426b23cc6b9f715214ec09a144e477bfcb2b06 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:37:14 -0400 Subject: [PATCH 393/443] Update llama.cpp --- CHANGELOG.md | 6 ++++++ vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a01f6d..9fba95d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- (llama.cpp) Update llama.cpp + +## [0.1.63] + +### Added + - (llama.cpp) Add full gpu utilisation in CUDA - (llama.cpp) Add get_vocab - (llama.cpp) Add low_vram parameter diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d411968..4f9c43e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd +Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 From d410f12fae32bf77a8eedc05e7bef263dc6b7cfd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:38:48 -0400 Subject: [PATCH 394/443] Update docs. Closes #386 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 366f050..a0b2030 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -228,7 +228,7 @@ class Llama: model_path: Path to the model. n_ctx: Maximum context size. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. 0 for random. + seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. From c7d7d5b656cb63ab54c17483dec2ba36b45142f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:39:48 -0400 Subject: [PATCH 395/443] Update Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fba95d..c4cd88c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - (llama.cpp) Update llama.cpp +- Fix docs for seed. Set -1 for random. 
## [0.1.63] From 44dcb5cf715cd384af85b99d13190c8d96f1f85e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:37:20 -0400 Subject: [PATCH 396/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4f9c43e..8596af4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 +Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef From 92b0013427be9a1fcea29a3090aa51d0fd8fb35f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:48:43 -0400 Subject: [PATCH 397/443] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4cd88c..0060af5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.64] + ### Added - (llama.cpp) Update llama.cpp diff --git a/pyproject.toml b/pyproject.toml index 281e1bb..eb7d23b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.63" +version = "0.1.64" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 0449149..cc17564 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.63", + version="0.1.64", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From d5974a1096860e13a7dd6c123bd4557497c6b70c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Jun 2023 21:07:49 +0000 Subject: [PATCH 398/443] Bump mkdocs-material from 9.1.15 to 9.1.16 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.15 to 9.1.16. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.15...9.1.16) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index e720acc..e006449 100644 --- a/poetry.lock +++ b/poetry.lock @@ -775,13 +775,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.15" +version = "9.1.16" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.15-py3-none-any.whl", hash = "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"}, - {file = "mkdocs_material-9.1.15.tar.gz", hash = "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a"}, + {file = "mkdocs_material-9.1.16-py3-none-any.whl", hash = "sha256:f9e62558a6b01ffac314423cbc223d970c25fbc78999860226245b64e64d6751"}, + {file = "mkdocs_material-9.1.16.tar.gz", hash = "sha256:1021bfea20f00a9423530c8c2ae9be3c78b80f5a527b3f822e6de3d872e5ab79"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" +content-hash = "fabdd2d7dba563fe7b01b4592dfb33e520b5f6e67317ce5f03205ecba396a577" diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..19015b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.15" +mkdocs-material = "^9.1.16" pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From e37798777e8aed908787f209396190438d724c72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:10 -0400 Subject: [PATCH 399/443] Update llama.cpp --- CHANGELOG.md | 4 ++++ llama_cpp/llama_cpp.py | 26 +++++++++++--------------- vendor/llama.cpp | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0060af5..a6cb99b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Fix struct misalignment bug + ## [0.1.64] ### Added diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d6be0ea..a516829 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -150,47 +150,43 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # struct llama_context_params { +# int seed; // RNG seed, -1 for random # int n_ctx; // text context # int n_batch; // prompt processing batch size # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# int seed; // RNG seed, -1 for random +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# // Keep the booleans together to avoid misalignment during copy-by-value. 
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache # bool logits_all; // the llama_eval() call computes all logits, not just the last one # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible # bool use_mlock; // force system to keep model in RAM # bool embedding; // embedding mode only - - -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; # }; class llama_context_params(Structure): _fields_ = [ + ("seed", c_int), ("n_ctx", c_int), ("n_batch", c_int), ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("progress_callback", llama_progress_callback), + ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), - ("seed", c_int), ("f16_kv", c_bool), - ( - "logits_all", - c_bool, - ), + ("logits_all", c_bool), ("vocab_only", c_bool), ("use_mmap", c_bool), ("use_mlock", c_bool), ("embedding", c_bool), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8596af4..2322ec2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef +Subproject commit 2322ec223a21625dfe9bd73ee677444a98a24ac9 From 3e7eae479631890196823324e0573416408f52a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:44 -0400 Subject: [PATCH 400/443] Bump Version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6cb99b..d5925bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.65] + ### Added - (llama.cpp) Fix struct misalignment bug diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..dac026c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.64" +version = "0.1.65" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index cc17564..9f27648 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.64", + version="0.1.65", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 282698b6d383e216e129856f25b0ca41348ad525 Mon Sep 17 00:00:00 2001 From: Alexey Date: Fri, 23 Jun 2023 00:19:24 +0400 Subject: [PATCH 401/443] server: pass seed param from command line to llama --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 313e27d..ef319c7 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -30,6 +30,9 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + seed: int = Field( + default=1337, description="Random seed. -1 for random." + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -109,6 +112,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, From d788fb49bf1ff2f41e651ded4e5b788f2185caad Mon Sep 17 00:00:00 2001 From: samfundev Date: Sat, 24 Jun 2023 15:51:46 -0400 Subject: [PATCH 402/443] Only concatenate after all batches are done --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a0b2030..d367601 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -405,6 +405,7 @@ class Llama: """ assert self.ctx is not None n_ctx = self._n_ctx + scores = [] for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) @@ -430,9 +431,8 @@ class Llama: logits_view = llama_cpp.llama_get_logits(self.ctx) logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) - self._scores: npt.NDArray[np.single] = np.concatenate( - (self._scores, np.array(logits, dtype=np.single)), axis=0 - ) + scores.append(np.array(logits, dtype=np.single)) + self._scores = np.concatenate(scores) def _sample( self, From b4a3db3e546df5ac030465b3b75d3b190548b1ed Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 08:50:30 -0400 Subject: [PATCH 403/443] Update type signature --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d367601..3465cd4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -325,7 +325,7 @@ class Llama: self._token_eos = Llama.token_eos() self._input_ids = np.array([], dtype=np.intc) - self._scores = np.ndarray((0, self._n_vocab), dtype=np.single) + self._scores: npt.NDArray[np.single] = np.ndarray((0, self._n_vocab), dtype=np.single) def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. 
@@ -405,7 +405,7 @@ class Llama: """ assert self.ctx is not None n_ctx = self._n_ctx - scores = [] + scores: List[npt.NDArray[np.single]] = [] for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) From 952228407ebd68ef621ad747e3561c821d1c02d3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 08:50:38 -0400 Subject: [PATCH 404/443] Update llama.cpp --- llama_cpp/llama.py | 9 ++++-- llama_cpp/llama_cpp.py | 66 +++++++++++++++++++++++++++++++++++++++++- vendor/llama.cpp | 2 +- 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3465cd4..3319cde 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -282,15 +282,18 @@ class Llama: if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.ctx = llama_cpp.llama_init_from_file( + self.model = llama_cpp.llama_load_model_from_file( self.model_path.encode("utf-8"), self.params ) + assert self.model is not None + + self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) assert self.ctx is not None if self.lora_path: - if llama_cpp.llama_apply_lora_from_file( - self.ctx, + if llama_cpp.llama_model_apply_lora_from_file( + self.model, llama_cpp.c_char_p(self.lora_path.encode("utf-8")), llama_cpp.c_char_p(self.lora_base.encode("utf-8")) if self.lora_base is not None diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a516829..23643e2 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -15,7 +15,7 @@ from ctypes import ( c_size_t, ) import pathlib -from typing import List +from typing import List, Union # Load the library @@ -105,6 +105,9 @@ LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_model; +llama_model_p = c_void_p + # struct llama_context; llama_context_p = c_void_p @@ -161,6 +164,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # // context pointer passed to the progress callback # void * progress_callback_user_data; + # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -296,6 +300,41 @@ _lib.llama_init_backend.argtypes = [] _lib.llama_init_backend.restype = None +# LLAMA_API struct llama_model * llama_load_model_from_file( +# const char * path_model, +# struct llama_context_params params); +def llama_load_model_from_file( + path_model: bytes, params: llama_context_params +) -> llama_model_p: + return _lib.llama_load_model_from_file(path_model, params) + + +_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params] +_lib.llama_load_model_from_file.restype = llama_model_p + + +# LLAMA_API void llama_free_model(struct llama_model * model); +def llama_free_model(model: llama_model_p): + return _lib.llama_free_model(model) + + +_lib.llama_free_model.argtypes = [llama_model_p] +_lib.llama_free_model.restype = None + + +# LLAMA_API struct llama_context * llama_new_context_with_model( +# struct llama_model * model, +# struct llama_context_params params); +def llama_new_context_with_model( + model: llama_model_p, params: llama_context_params +) -> llama_context_p: + return _lib.llama_new_context_with_model(model, params) + + +_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params] +_lib.llama_new_context_with_model.restype = llama_context_p + + # LLAMA_API int64_t llama_time_us(); def llama_time_us() -> int: return _lib.llama_time_us() @@ -376,6 +415,31 @@ _lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, _lib.llama_apply_lora_from_file.restype = c_int +# LLAMA_API int llama_model_apply_lora_from_file( +# const struct llama_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def llama_model_apply_lora_from_file( + model: llama_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.llama_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.llama_model_apply_lora_from_file.argtypes = [ + llama_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.llama_model_apply_lora_from_file.restype = c_int + + # Returns the number of tokens in the KV cache # LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2322ec2..447ccbe 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2322ec223a21625dfe9bd73ee677444a98a24ac9 +Subproject commit 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d From 5193af297bb69c16ac6aa523db25272701c9fb34 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 08:53:54 -0400 Subject: [PATCH 405/443] Bump version --- CHANGELOG.md | 11 +++++++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5925bc..f0d70b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.66] + +## Added + +- (llama.cpp) New model API + +## Fixed + +- Performance issue during eval caused by looped np.concatenate call +- State pickling issue when saving cache to disk + ## [0.1.65] ### Added diff --git a/pyproject.toml b/pyproject.toml index c6ffe38..bd774c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" 
-version = "0.1.65" +version = "0.1.66" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 9f27648..fbd5551 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.65", + version="0.1.66", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 155dedf28f3a8be4f37333e5cecacc8491f8fe1e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 16:25:17 -0400 Subject: [PATCH 406/443] Add readthedocsc config --- .readthedocs.yaml | 25 +++++++++++++++++++++++++ pyproject.toml | 5 +++++ 2 files changed, 30 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..27a168b --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,25 @@ +# Read the Docs configuration file for MkDocs projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - method: pip + path: . + extra_requirements: + - docs + +submodules: + include: all + recursive: true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bd774c0..0a28091 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,3 +42,8 @@ requires = [ "ninja", ] build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +server = ["uvicorn", "fastapi", "sse-starlette"] +test = ["pytest"] +docs = ["mkdocs", "mkdocstrings[python]", "mkdocs-material"] From 66b8b979a53a53df18c88fefa3e9f7c51fd4b220 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 16:31:16 -0400 Subject: [PATCH 407/443] Update readthedocs setup --- .readthedocs.yaml | 3 +-- docs/requirements.txt | 3 +++ pyproject.toml | 7 +------ 3 files changed, 5 insertions(+), 8 deletions(-) create mode 100644 docs/requirements.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 27a168b..ff3e950 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -17,8 +17,7 @@ python: install: - method: pip path: . 
- extra_requirements: - - docs + - requirements: docs/requirements.txt submodules: include: all diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..199bd4f --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +mkdocs +mkdocs-material +mkdocstrings[python] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0a28091..6a6d2c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,9 +41,4 @@ requires = [ "cmake>=3.18", "ninja", ] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -server = ["uvicorn", "fastapi", "sse-starlette"] -test = ["pytest"] -docs = ["mkdocs", "mkdocstrings[python]", "mkdocs-material"] +build-backend = "setuptools.build_meta" \ No newline at end of file From 452929404fc407b060f87d6bddee4c6eb36c84dc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 26 Jun 2023 16:35:38 -0400 Subject: [PATCH 408/443] Updated docs link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0e62f3d..bd2e778 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 🦙 Python Bindings for `llama.cpp` -[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) +[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) @@ -15,7 +15,7 @@ This package provides: - OpenAI-like API - LangChain compatibility -Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). +Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). ## Installation from PyPI (recommended) From c9a8b7eb43267059080d747277f32cfa66203929 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 21:03:34 +0000 Subject: [PATCH 409/443] Bump numpy from 1.24.3 to 1.24.4 Bumps [numpy](https://github.com/numpy/numpy) from 1.24.3 to 1.24.4. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v1.24.3...v1.24.4) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 60 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/poetry.lock b/poetry.lock index e006449..041bb03 100644 --- a/poetry.lock +++ b/poetry.lock @@ -872,39 +872,39 @@ files = [ [[package]] name = "numpy" -version = "1.24.3" +version = "1.24.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.8" files = [ - {file = "numpy-1.24.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3c1104d3c036fb81ab923f507536daedc718d0ad5a8707c6061cdfd6d184e570"}, - {file = "numpy-1.24.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:202de8f38fc4a45a3eea4b63e2f376e5f2dc64ef0fa692838e31a808520efaf7"}, - {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8535303847b89aa6b0f00aa1dc62867b5a32923e4d1681a35b5eef2d9591a463"}, - {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d926b52ba1367f9acb76b0df6ed21f0b16a1ad87c6720a1121674e5cf63e2b6"}, - {file = "numpy-1.24.3-cp310-cp310-win32.whl", hash = "sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"}, - {file = "numpy-1.24.3-cp310-cp310-win_amd64.whl", hash = "sha256:ab5f23af8c16022663a652d3b25dcdc272ac3f83c3af4c02eb8b824e6b3ab9d7"}, - {file = "numpy-1.24.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9a7721ec204d3a237225db3e194c25268faf92e19338a35f3a224469cb6039a3"}, - {file = "numpy-1.24.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d6cc757de514c00b24ae8cf5c876af2a7c3df189028d68c0cb4eaa9cd5afc2bf"}, - {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e3f4e85fc5d4fd311f6e9b794d0c00e7002ec122be271f2019d63376f1d385"}, - {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d3c026f57ceaad42f8231305d4653d5f05dc6332a730ae5c0bea3513de0950"}, - {file = "numpy-1.24.3-cp311-cp311-win32.whl", hash = "sha256:c91c4afd8abc3908e00a44b2672718905b8611503f7ff87390cc0ac3423fb096"}, - {file = "numpy-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:5342cf6aad47943286afa6f1609cad9b4266a05e7f2ec408e2cf7aea7ff69d80"}, - {file = "numpy-1.24.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7776ea65423ca6a15255ba1872d82d207bd1e09f6d0894ee4a64678dd2204078"}, - {file = "numpy-1.24.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ae8d0be48d1b6ed82588934aaaa179875e7dc4f3d84da18d7eae6eb3f06c242c"}, - {file = "numpy-1.24.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c"}, - {file = "numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4749e053a29364d3452c034827102ee100986903263e89884922ef01a0a6fd2f"}, - {file = "numpy-1.24.3-cp38-cp38-win32.whl", hash = "sha256:d933fabd8f6a319e8530d0de4fcc2e6a61917e0b0c271fded460032db42a0fe4"}, - {file = "numpy-1.24.3-cp38-cp38-win_amd64.whl", hash = "sha256:56e48aec79ae238f6e4395886b5eaed058abb7231fb3361ddd7bfdf4eed54289"}, - {file = "numpy-1.24.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4719d5aefb5189f50887773699eaf94e7d1e02bf36c1a9d353d9f46703758ca4"}, - {file = "numpy-1.24.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ec87a7084caa559c36e0a2309e4ecb1baa03b687201d0a847c8b0ed476a7187"}, - {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ea8282b9bcfe2b5e7d491d0bf7f3e2da29700cec05b49e64d6246923329f2b02"}, - {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:210461d87fb02a84ef243cac5e814aad2b7f4be953b32cb53327bb49fd77fbb4"}, - {file = "numpy-1.24.3-cp39-cp39-win32.whl", hash = "sha256:784c6da1a07818491b0ffd63c6bbe5a33deaa0e25a20e1b3ea20cf0e43f8046c"}, - {file = "numpy-1.24.3-cp39-cp39-win_amd64.whl", hash = "sha256:d5036197ecae68d7f491fcdb4df90082b0d4960ca6599ba2659957aafced7c17"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:352ee00c7f8387b44d19f4cada524586f07379c0d49270f87233983bc5087ca0"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7d6acc2e7524c9955e5c903160aa4ea083736fde7e91276b0e5d98e6332812"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:35400e6a8d102fd07c71ed7dcadd9eb62ee9a6e84ec159bd48c28235bbb0f8e4"}, - {file = "numpy-1.24.3.tar.gz", hash = "sha256:ab344f1bf21f140adab8e47fdbc7c35a477dc01408791f8ba00d018dd0bc5155"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = 
"sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, ] [[package]] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "fabdd2d7dba563fe7b01b4592dfb33e520b5f6e67317ce5f03205ecba396a577" +content-hash = "4dc5ad121b7efb7ce00474d3b1f7d1a92d27bca1a4fda9b88c4fddc23c872373" diff --git a/pyproject.toml b/pyproject.toml index 6a6d2c9..064f36b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" typing-extensions = "^4.6.3" -numpy = "^1.20.0" +numpy = "^1.24.4" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.97.0", optional = true } From e03c3806f80c9bad3acba6d9c0baae5754a948fc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 21:16:53 +0000 Subject: [PATCH 410/443] Bump mkdocs-material from 9.1.16 to 9.1.17 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.16 to 9.1.17. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.16...9.1.17) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 041bb03..2977039 100644 --- a/poetry.lock +++ b/poetry.lock @@ -775,13 +775,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.16" +version = "9.1.17" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.16-py3-none-any.whl", hash = "sha256:f9e62558a6b01ffac314423cbc223d970c25fbc78999860226245b64e64d6751"}, - {file = "mkdocs_material-9.1.16.tar.gz", hash = "sha256:1021bfea20f00a9423530c8c2ae9be3c78b80f5a527b3f822e6de3d872e5ab79"}, + {file = "mkdocs_material-9.1.17-py3-none-any.whl", hash = "sha256:809ed68427fbab0330b0b07bc93175824c3b98f4187060a5c7b46aa8ae398a75"}, + {file = "mkdocs_material-9.1.17.tar.gz", hash = "sha256:5a076524625047bf4ee4da1509ec90626f8fce915839dc07bdae6b59ff4f36f9"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "4dc5ad121b7efb7ce00474d3b1f7d1a92d27bca1a4fda9b88c4fddc23c872373" +content-hash = "9d7ba299cf343c9974f78a834cc2506a898996e12cfa3e67a0a4244886678241" diff --git a/pyproject.toml b/pyproject.toml index 064f36b..1286576 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.16" +mkdocs-material = "^9.1.17" pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From dcf18342944fda04ff64cf405c9d119181428475 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 21:23:22 +0000 Subject: [PATCH 411/443] Bump fastapi from 0.97.0 to 0.98.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.97.0 to 0.98.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.97.0...0.98.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2977039..e47eb64 100644 --- a/poetry.lock +++ b/poetry.lock @@ -373,13 +373,13 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.97.0" +version = "0.98.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.97.0-py3-none-any.whl", hash = "sha256:95d757511c596409930bd20673358d4a4d709004edb85c5d24d6ffc48fabcbf2"}, - {file = "fastapi-0.97.0.tar.gz", hash = "sha256:b53248ee45f64f19bb7600953696e3edf94b0f7de94df1e5433fc5c6136fa986"}, + {file = "fastapi-0.98.0-py3-none-any.whl", hash = "sha256:f4165fb1fe3610c52cb1b8282c1480de9c34bc270f56a965aa93a884c350d605"}, + {file = "fastapi-0.98.0.tar.gz", hash = "sha256:0d3c18886f652038262b5898fec6b09f4ca92ee23e9d9b1d1d24e429f84bf27b"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "9d7ba299cf343c9974f78a834cc2506a898996e12cfa3e67a0a4244886678241" +content-hash = "b9bbe76d769fd94486ed6d4985479e38f38cd86ad1a58f61c834cdce2403c4ab" diff --git a/pyproject.toml b/pyproject.toml index 1286576..bdfe9a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.3" numpy = "^1.24.4" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.97.0", optional = true } +fastapi = { version = "^0.98.0", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From 89f9e435ba859bd3f1b348e1e1be9630ad17451b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 23:26:11 +0000 Subject: [PATCH 412/443] Bump pytest from 7.3.2 to 7.4.0 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.2 to 7.4.0. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.3.2...7.4.0) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index e47eb64..0a3460f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1067,13 +1067,13 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.3.2" +version = "7.4.0" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, - {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "b9bbe76d769fd94486ed6d4985479e38f38cd86ad1a58f61c834cdce2403c4ab" +content-hash = "f2a6d5c33cb22ec80b093ccd23454a1521ad1949816505465a1c3968360c8cd8" diff --git a/pyproject.toml b/pyproject.toml index bdfe9a6..11bf512 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.17" -pytest = "^7.3.2" +pytest = "^7.4.0" httpx = "^0.24.1" scikit-build = "0.17.6" From dae983342a679652c4bf03c9a97943f4064bbad3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 27 Jun 2023 12:45:31 -0400 Subject: [PATCH 413/443] Update docs --- docs/api-reference.md | 37 +++++++++++++++++++++++++++++++++++++ docs/index.md | 36 ------------------------------------ 2 files changed, 37 insertions(+), 36 deletions(-) create mode 100644 docs/api-reference.md diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..2c5dec1 --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,37 @@ +--- +title: API Reference +--- + +::: llama_cpp.Llama + options: + members: + - __init__ + - tokenize + - detokenize + - reset + - eval + - sample + - generate + - create_embedding + - embed + - create_completion + - __call__ + - create_chat_completion + - set_cache + - save_state + - load_state + - token_bos + - token_eos + show_root_heading: true + +::: llama_cpp.LlamaCache + options: + show_root_heading: true + +::: llama_cpp.LlamaState + options: + show_root_heading: true + +::: llama_cpp.llama_cpp + options: + show_if_no_docstring: true \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 99b1f59..7d5ccc3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -87,42 +87,6 @@ git submodule update --init --recursive python3 setup.py develop ``` -## API Reference - -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - set_cache - - save_state - - load_state - - token_bos - - token_eos - show_root_heading: true - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - ## License This project is licensed under the terms of the MIT license. 
\ No newline at end of file From a3766591bb3ebe1f001456926a13d38edda4c2ae Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 27 Jun 2023 13:02:30 -0400 Subject: [PATCH 414/443] Update docs --- docs/{macos_install.md => install/macos.md} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename docs/{macos_install.md => install/macos.md} (96%) diff --git a/docs/macos_install.md b/docs/install/macos.md similarity index 96% rename from docs/macos_install.md rename to docs/install/macos.md index 33dcb5d..6004696 100644 --- a/docs/macos_install.md +++ b/docs/install/macos.md @@ -1,6 +1,6 @@ - -# llama-cpp-python - MacOS Install with Metal GPU - +--- +title: MacOS Install with Metal GPU +--- **(1) Make sure you have xcode installed... at least the command line parts** ``` From 442213b070e579774c0374d6150fc38e0b3c978d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 28 Jun 2023 21:07:58 -0400 Subject: [PATCH 415/443] Add stopping criteria and logits processor to docs --- docs/api-reference.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/api-reference.md b/docs/api-reference.md index 2c5dec1..1290cad 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -32,6 +32,22 @@ title: API Reference options: show_root_heading: true +::: llama_cpp.LogitsProcessor + options: + show_root_heading: true + +::: llama_cpp.LogitsProcessorList + options: + show_root_heading: true + +::: llama_cpp.StoppingCriteria + options: + show_root_heading: true + +::: llama_cpp.StoppingCriteriaList + options: + show_root_heading: true + ::: llama_cpp.llama_cpp options: show_if_no_docstring: true \ No newline at end of file From a5e059c05371d29e5388b6f81a52a4a3f9209479 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 28 Jun 2023 23:58:55 -0400 Subject: [PATCH 416/443] Free model when llama is unloaded. 
Closes #434 --- llama_cpp/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e030b49..2865d27 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1437,6 +1437,9 @@ class Llama: return self._convert_text_completion_to_chat(completion) def __del__(self): + if self.model is not None: + llama_cpp.llama_free_model(self.model) + self.model = None if self.ctx is not None: llama_cpp.llama_free(self.ctx) self.ctx = None From b95b0ffbebfc87e40bab98c4be1fa2fdd2a1aa50 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 29 Jun 2023 00:40:47 -0400 Subject: [PATCH 417/443] Use pre-allocated buffers to store input_ids and scores --- llama_cpp/llama.py | 86 ++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2865d27..764c91e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -141,7 +141,7 @@ class LlamaDiskCache(BaseLlamaCache): if _key is None: raise KeyError("Key not found") value: "LlamaState" = self.cache.pop(_key) # type: ignore - # NOTE: This puts an integer as key in cache, which breaks, + # NOTE: This puts an integer as key in cache, which breaks, # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore return value @@ -166,17 +166,15 @@ class LlamaDiskCache(BaseLlamaCache): class LlamaState: def __init__( self, - eval_tokens: Deque[int], - eval_logits: Deque[List[float]], input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], + n_tokens: int, llama_state: bytes, llama_state_size: int, ): - self.eval_tokens = eval_tokens - self.eval_logits = eval_logits self.input_ids = input_ids self.scores = scores + self.n_tokens = n_tokens self.llama_state = llama_state self.llama_state_size = llama_state_size @@ -267,8 +265,6 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.eval_tokens: Deque[int] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[BaseLlamaCache] = None @@ -329,8 +325,30 @@ class Llama: self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() - self._input_ids = np.array([], dtype=np.intc) - self._scores: npt.NDArray[np.single] = np.ndarray((0, self._n_vocab), dtype=np.single) + self.n_tokens = 0 + self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.scores: npt.NDArray[np.single] = np.ndarray( + (n_ctx, self._n_vocab), dtype=np.single + ) + + @property + def _input_ids(self) -> npt.NDArray[np.intc]: + return self.input_ids[: self.n_tokens] + + @property + def _scores(self) -> npt.NDArray[np.single]: + return self.scores[: self.n_tokens, :] + + @property + def eval_tokens(self) -> Deque[int]: + return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) + + @property + def eval_logits(self) -> Deque[List[float]]: + return deque( + self.scores[: self.n_tokens, :].tolist(), + maxlen=self._n_ctx if self.params.logits_all else 1, + ) def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. @@ -397,10 +415,7 @@ class Llama: def reset(self): """Reset the model state.""" - self.eval_tokens.clear() - self.eval_logits.clear() - self._input_ids = np.array([], dtype=np.intc) - self._scores = np.ndarray((0, self._n_vocab), dtype=np.single) + self.n_tokens = 0 def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. 
@@ -410,7 +425,6 @@ class Llama: """ assert self.ctx is not None n_ctx = self._n_ctx - scores: List[npt.NDArray[np.single]] = [] for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) @@ -425,19 +439,16 @@ class Llama: if return_code != 0: raise RuntimeError(f"llama_eval returned {return_code}") # Save tokens - self.eval_tokens.extend(batch) - self._input_ids: npt.NDArray[np.intc] = np.concatenate( - (self._input_ids, np.array(batch, dtype=np.intc)), axis=0 - ) + self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 n_vocab = self._n_vocab cols = n_vocab logits_view = llama_cpp.llama_get_logits(self.ctx) logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] - self.eval_logits.extend(logits) - scores.append(np.array(logits, dtype=np.single)) - self._scores = np.concatenate(scores) + self.scores[self.n_tokens : self.n_tokens + n_tokens, :] = logits + # Update n_tokens + self.n_tokens += n_tokens def _sample( self, @@ -457,8 +468,7 @@ class Llama: logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None - assert len(self.eval_logits) > 0 - assert self._scores.shape[0] > 0 + assert self.n_tokens > 0 n_vocab = self._n_vocab n_ctx = self._n_ctx top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k @@ -475,7 +485,6 @@ class Llama: dtype=np.single, ) self._scores[-1, :] = logits - self.eval_logits[-1] = logits.tolist() nl_logit = logits[self._token_nl] candidates = self._candidates @@ -672,14 +681,7 @@ class Llama: print("Llama.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] - self._input_ids = self._input_ids[:longest_prefix] - self._scores = self._scores[:longest_prefix, :] - for _ in range(len(self.eval_tokens) - longest_prefix): - self.eval_tokens.pop() - try: - self.eval_logits.pop() - except IndexError: - pass + self.n_tokens = longest_prefix if reset: self.reset() @@ -819,7 +821,9 @@ class Llama: llama_cpp.llama_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: - raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}") + raise ValueError( + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" + ) # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( @@ -1513,22 +1517,20 @@ class Llama: file=sys.stderr, ) return LlamaState( - eval_tokens=self.eval_tokens.copy(), - eval_logits=self.eval_logits.copy(), - scores=self._scores.copy(), - input_ids=self._input_ids.copy(), + scores=self.scores.copy(), + input_ids=self.input_ids.copy(), + n_tokens=self.n_tokens, llama_state=bytes(llama_state_compact), llama_state_size=n_bytes, ) def load_state(self, state: LlamaState) -> None: assert self.ctx is not None - self.eval_tokens = state.eval_tokens.copy() - self.eval_logits = state.eval_logits.copy() - self._scores = state.scores.copy() - self._input_ids = state.input_ids.copy() + self.scores = state.scores.copy() + self.input_ids = state.input_ids.copy() + self.n_tokens = state.n_tokens state_size = state.llama_state_size - LLamaStateArrayType = (llama_cpp.c_uint8 * state_size) + LLamaStateArrayType = llama_cpp.c_uint8 * state_size llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: From a2ede37bd568aacb2a4d6dbc9bf2d01ba02c77b6 
Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 29 Jun 2023 00:45:46 -0400 Subject: [PATCH 418/443] Load logits directly into scores buffer --- llama_cpp/llama.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 764c91e..8b7dee0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -442,11 +442,8 @@ class Llama: self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 - n_vocab = self._n_vocab - cols = n_vocab - logits_view = llama_cpp.llama_get_logits(self.ctx) - logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] - self.scores[self.n_tokens : self.n_tokens + n_tokens, :] = logits + cols = self._n_vocab + self.scores[self.n_tokens : self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens From 4d1eb88b1375136bc2a37a58e927acfa50145804 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 29 Jun 2023 00:46:15 -0400 Subject: [PATCH 419/443] Bump version --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0d70b4..0ff6cb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.67] + +## Fixed + +- Fix performance bug in Llama model by pre-allocating memory tokens and logits. +- Fix bug in Llama model where the model was not free'd after use. + ## [0.1.66] ## Added diff --git a/pyproject.toml b/pyproject.toml index 11bf512..a392fd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.66" +version = "0.1.67" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index fbd5551..9559341 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.66", + version="0.1.67", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From e34f4414cf36f24dbe83f2e4baa610f207b5eb4d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 29 Jun 2023 00:57:27 -0400 Subject: [PATCH 420/443] Hotfix: logits_all bug --- llama_cpp/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8b7dee0..688b2a7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -443,7 +443,8 @@ class Llama: # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - self.scores[self.n_tokens : self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens From c67f7863604eedbcb3e17884d1aa3354f7857cf9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 29 Jun 2023 01:08:15 -0400 Subject: [PATCH 421/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 32 +++++++++++++++++++++++++++----- vendor/llama.cpp | 2 +- 2 files changed, 28 
insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 23643e2..52fc14e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -290,13 +290,14 @@ _lib.llama_mlock_supported.restype = c_bool # // TODO: not great API - very likely to change # // Initialize the llama + ggml backend +# // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(); -def llama_init_backend(): - return _lib.llama_init_backend() +# LLAMA_API void llama_init_backend(bool numa); +def llama_init_backend(numa: c_bool): + return _lib.llama_init_backend(numa) -_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.argtypes = [c_bool] _lib.llama_init_backend.restype = None @@ -565,6 +566,27 @@ _lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] _lib.llama_eval.restype = c_int +# // Same as llama_eval, but use float matrix input directly. +# LLAMA_API int llama_eval_embd( +# struct llama_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def llama_eval_embd( + ctx: llama_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int] +_lib.llama_eval_embd.restype = c_int + + # Convert the provided text into tokens. # The tokens pointer must be large enough to hold the resulting tokens. # Returns the number of tokens on success, no more than n_max_tokens @@ -998,5 +1020,5 @@ _lib.llama_print_system_info.restype = c_char_p _llama_initialized = False if not _llama_initialized: - llama_init_backend() + llama_init_backend(c_bool(False)) _llama_initialized = True diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 447ccbe..96a712c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d +Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1 From 485eee7bef0dabcb3dbf3cc7228bc3fa50e75713 Mon Sep 17 00:00:00 2001 From: vladkens Date: Fri, 30 Jun 2023 00:48:21 +0300 Subject: [PATCH 422/443] Update README.md Fix installation link in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bd2e778..c461333 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` -Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) +Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) ## High-level API From c8d0647caa926efc76407ed4e4d20192e6ec540a Mon Sep 17 00:00:00 2001 From: Mike Date: Fri, 30 Jun 2023 16:42:13 +0800 Subject: [PATCH 423/443] Update README.md prevent not found errors --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c461333..fb652a9 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,7 @@ To get started, clone the repository and install the package in development mode ```bash git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git +cd llama-cpp-python # Install with pip pip install -e . 
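Taken together, the low-level additions above can be exercised from Python with a short sketch: the model API from [PATCH 404/443] (llama_load_model_from_file and llama_new_context_with_model) together with the llama_eval_embd binding from [PATCH 421/443]. The sketch is not part of any patch; the model path and the zero-filled embedding row are placeholder assumptions chosen only to show the calling convention, and it leans on the existing llama_n_embd and llama_free wrappers in llama_cpp.py.

import ctypes
import llama_cpp

# The module already calls llama_init_backend(c_bool(False)) at import time
# (see the diff above), so no explicit backend initialization is needed here.
params = llama_cpp.llama_context_default_params()

# Placeholder path: substitute a real ggml model file.
model = llama_cpp.llama_load_model_from_file(b"./models/7B/ggml-model.bin", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)

# llama_eval_embd takes a float matrix instead of token ids; a single
# zero-filled row of width n_embd stands in for a real embedding here.
n_embd = llama_cpp.llama_n_embd(ctx)
embd = (ctypes.c_float * n_embd)()
return_code = llama_cpp.llama_eval_embd(ctx, embd, 1, 0, 1)
if return_code != 0:
    raise RuntimeError(f"llama_eval_embd returned {return_code}")

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)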
From fb02077e3f0765b5fd872508b2065a535375b380 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 20:55:32 +0000 Subject: [PATCH 424/443] Bump fastapi from 0.98.0 to 0.99.1 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.98.0 to 0.99.1. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.98.0...0.99.1) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0a3460f..7cbacc8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -373,18 +373,19 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.98.0" +version = "0.99.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.98.0-py3-none-any.whl", hash = "sha256:f4165fb1fe3610c52cb1b8282c1480de9c34bc270f56a965aa93a884c350d605"}, - {file = "fastapi-0.98.0.tar.gz", hash = "sha256:0d3c18886f652038262b5898fec6b09f4ca92ee23e9d9b1d1d24e429f84bf27b"}, + {file = "fastapi-0.99.1-py3-none-any.whl", hash = "sha256:976df7bab51ac7beda9f68c4513b8c4490b5c1135c72aafd0a5ee4023ec5282e"}, + {file = "fastapi-0.99.1.tar.gz", hash = "sha256:ac78f717cd80d657bd183f94d33b9bda84aa376a46a9dab513586b8eef1dc6fc"}, ] [package.dependencies] pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" starlette = ">=0.27.0,<0.28.0" +typing-extensions = ">=4.5.0" [package.extras] all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] @@ -1632,4 +1633,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f2a6d5c33cb22ec80b093ccd23454a1521ad1949816505465a1c3968360c8cd8" +content-hash = "19f6d2d9a3cb563de91acc945752274c08177e5ff36c63bca011b9c22c24c9fc" diff --git a/pyproject.toml b/pyproject.toml index a392fd4..0586408 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.3" numpy = "^1.24.4" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.98.0", optional = true } +fastapi = { version = "^0.99.1", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From f1b442337d7de5a7b52c26b0ad944428b5953a53 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 4 Jul 2023 18:22:53 +0000 Subject: [PATCH 425/443] Bump typing-extensions from 4.6.3 to 4.7.1 Bumps [typing-extensions](https://github.com/python/typing_extensions) from 4.6.3 to 4.7.1. - [Release notes](https://github.com/python/typing_extensions/releases) - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.6.3...4.7.1) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7cbacc8..8b86d0e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1504,13 +1504,13 @@ urllib3 = ">=1.26.0" [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] @@ -1633,4 +1633,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "19f6d2d9a3cb563de91acc945752274c08177e5ff36c63bca011b9c22c24c9fc" +content-hash = "ed454fad4bd4ea920624c1bcdf2beb74bdb8e9394c22156234c8bc0fde770bd8" diff --git a/pyproject.toml b/pyproject.toml index 0586408..e79d72e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" -typing-extensions = "^4.6.3" +typing-extensions = "^4.7.1" numpy = "^1.24.4" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } From 9261a529165d1bea2bc810224aa3aa4dc71e69fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 4 Jul 2023 21:21:09 +0000 Subject: [PATCH 426/443] Bump mkdocs-material from 9.1.17 to 9.1.18 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.17 to 9.1.18. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.17...9.1.18) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 8b86d0e..9d12966 100644 --- a/poetry.lock +++ b/poetry.lock @@ -776,13 +776,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.17" +version = "9.1.18" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.17-py3-none-any.whl", hash = "sha256:809ed68427fbab0330b0b07bc93175824c3b98f4187060a5c7b46aa8ae398a75"}, - {file = "mkdocs_material-9.1.17.tar.gz", hash = "sha256:5a076524625047bf4ee4da1509ec90626f8fce915839dc07bdae6b59ff4f36f9"}, + {file = "mkdocs_material-9.1.18-py3-none-any.whl", hash = "sha256:5bcf8fb79ac2f253c0ffe93fa181cba87718c6438f459dc4180ac7418cc9a450"}, + {file = "mkdocs_material-9.1.18.tar.gz", hash = "sha256:981dd39979723d4cda7cfc77bbbe5e54922d5761a7af23fb8ba9edb52f114b13"}, ] [package.dependencies] @@ -1633,4 +1633,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "ed454fad4bd4ea920624c1bcdf2beb74bdb8e9394c22156234c8bc0fde770bd8" +content-hash = "da42c48a426b64ce393b4febca1be0e2ea0fe9d48cedb2392b390d4a49276474" diff --git a/pyproject.toml b/pyproject.toml index e79d72e..73e46a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.17" +mkdocs-material = "^9.1.18" pytest = "^7.4.0" httpx = "^0.24.1" scikit-build = "0.17.6" From b994296c7576067d0862247c284dbf6eae96a46f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Jul 2023 01:00:14 -0400 Subject: [PATCH 427/443] Update llama.cpp --- llama_cpp/llama_cpp.py | 28 ++++++++++++++++------------ vendor/llama.cpp | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 52fc14e..c68fb18 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -5,6 +5,8 @@ from ctypes import ( c_int, c_float, c_char_p, + c_int32, + c_uint32, c_void_p, c_bool, POINTER, @@ -105,6 +107,9 @@ LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# #define LLAMA_DEFAULT_SEED 0xFFFFFFFF +LLAMA_DEFAULT_SEED = c_int(0xFFFFFFFF) + # struct llama_model; llama_model_p = c_void_p @@ -153,18 +158,17 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # struct llama_context_params { -# int seed; // RNG seed, -1 for random -# int n_ctx; // text context -# int n_batch; // prompt processing batch size -# int n_gpu_layers; // number of layers to store in VRAM -# int main_gpu; // the GPU that is used for scratch and small tensors +# uint32_t seed; // RNG seed, -1 for random +# int32_t n_ctx; // text context +# int32_t n_batch; // prompt processing batch size +# int32_t n_gpu_layers; // number of layers to store in VRAM +# int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -176,11 +180,11 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # }; class llama_context_params(Structure): _fields_ = [ - ("seed", c_int), - ("n_ctx", c_int), - ("n_batch", c_int), - ("n_gpu_layers", c_int), - ("main_gpu", c_int), + ("seed", c_uint32), + ("n_ctx", c_int32), + ("n_batch", c_int32), + ("n_gpu_layers", c_int32), + ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), @@ -453,7 +457,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. # LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): +def llama_set_rng_seed(ctx: llama_context_p, seed: c_uint32): return _lib.llama_set_rng_seed(ctx, seed) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 96a712c..7f0e9a7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1 +Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa From a1b2d5c09b9061e265b504fc6307559f89a8589c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Jul 2023 01:06:46 -0400 Subject: [PATCH 428/443] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ff6cb8..c6cfaab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.68] + +## [Added] + +- (llama.cpp) Update llama.cpp + ## [0.1.67] ## Fixed diff --git a/pyproject.toml b/pyproject.toml index 73e46a1..b3ad3b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.67" +version = "0.1.68" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 9559341..32101eb 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.67", + version="0.1.68", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 98ae4e58a3adce4b3cf775121ee1f1ac2ce5ddb6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Jul 2023 17:57:56 -0400 Subject: [PATCH 429/443] Update llama.cpp --- Makefile | 3 +++ llama_cpp/llama_cpp.py | 39 +++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 66d93f3..1be35cf 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,9 @@ deploy.gh-docs: mkdocs build mkdocs gh-deploy +test: + python3 -m pytest + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c68fb18..17c6319 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2,6 +2,7 @@ import sys import os import ctypes from ctypes import ( + c_double, c_int, c_float, c_char_p, @@ -169,6 +170,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # // context pointer passed to the progress callback # void * progress_callback_user_data; + # // Keep the 
booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -256,6 +258,34 @@ class llama_model_quantize_params(Structure): ] +# // performance timing information +# struct llama_timings { +# double t_start_ms; +# double t_end_ms; +# double t_load_ms; +# double t_sample_ms; +# double t_p_eval_ms; +# double t_eval_ms; + + +# int32_t n_sample; +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_timings(Structure): + _fields_ = [ + ("t_start_ms", c_double), + ("t_end_ms", c_double), + ("t_load_ms", c_double), + ("t_sample_ms", c_double), + ("t_p_eval_ms", c_double), + ("t_eval_ms", c_double), + ("n_sample", c_int32), + ("n_p_eval", c_int32), + ("n_eval", c_int32), + ] + + # LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -991,6 +1021,15 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +def llama_get_timings(ctx: llama_context_p) -> llama_timings: + return _lib.llama_get_timings(ctx) + + +_lib.llama_get_timings.argtypes = [llama_context_p] +_lib.llama_get_timings.restype = llama_timings + + # LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f0e9a7..dfd9fce 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa +Subproject commit dfd9fce6d65599bf33df43e616e85aa639bdae4c From 4c7cdcca00f63896a95e09a11f424237e224bc72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:17 -0400 Subject: [PATCH 430/443] Add interruptible streaming requests for llama-cpp-python server. Closes #183 --- CHANGELOG.md | 4 ++++ llama_cpp/server/app.py | 31 +++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6cfaab..11251c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [Added] + +- (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. + ## [0.1.68] ## [Added] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef319c7..b9d5771 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,12 +146,27 @@ def create_app(settings: Optional[Settings] = None): return app -llama_lock = Lock() +llama_outer_lock = Lock() +llama_inner_lock = Lock() def get_llama(): - with llama_lock: - yield llama + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. 
+ llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield llama + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() def get_settings(): @@ -364,6 +379,9 @@ async def create_completion( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -371,7 +389,6 @@ async def create_completion( print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -494,6 +511,9 @@ async def create_chat_completion( await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -501,7 +521,6 @@ async def create_chat_completion( print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -533,8 +552,8 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: + assert llama is not None return { "object": "list", "data": [ From cc542b4452ec92919bb2964e40314c7077c264be Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:54 -0400 Subject: [PATCH 431/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index dfd9fce..481f793 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit dfd9fce6d65599bf33df43e616e85aa639bdae4c +Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc From 57d8ec3899f2c48def77f8cf3d3feae45ca12aa3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:37:23 -0400 Subject: [PATCH 432/443] Add setting to control request interruption --- llama_cpp/server/app.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b9d5771..5d47160 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -85,6 +85,10 @@ class Settings(BaseSettings): port: int = Field( default=8000, description="Listen port" ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) router = APIRouter() @@ -379,7 +383,7 @@ async def create_completion( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) @@ -486,6 +490,7 @@ async def 
create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), + settings: Settings = Depends(get_settings), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", @@ -511,7 +516,7 @@ async def create_chat_completion( await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) From ca11673061ecd9198b4800f68073ae14d4440ecd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:38:51 -0400 Subject: [PATCH 433/443] Add universal docker image --- Makefile | 7 +++++++ docker/simple/Dockerfile | 33 +++++++++++++++++++++++++++++++++ docker/simple/run.sh | 4 ++++ 3 files changed, 44 insertions(+) create mode 100644 docker/simple/Dockerfile create mode 100644 docker/simple/run.sh diff --git a/Makefile b/Makefile index 1be35cf..c359260 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,12 @@ deploy.gh-docs: test: python3 -m pytest +docker: + docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . + +run-server: + uvicorn --factory llama.server:app --host ${HOST} --port ${PORT} + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so @@ -56,4 +62,5 @@ clean: build.sdist \ deploy.pypi \ deploy.gh-docs \ + docker \ clean \ No newline at end of file diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile new file mode 100644 index 0000000..ad36b98 --- /dev/null +++ b/docker/simple/Dockerfile @@ -0,0 +1,33 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN mkdir /app +WORKDIR /app +COPY . /app + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +RUN make build && make clean + +# Set environment variable for the host +ENV HOST=0.0.0.0 +ENV PORT=8000 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] diff --git a/docker/simple/run.sh b/docker/simple/run.sh new file mode 100644 index 0000000..c85e73d --- /dev/null +++ b/docker/simple/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +make build +uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT From d270ec231ad620beeb20da93de3b05f7a2d55cb4 Mon Sep 17 00:00:00 2001 From: Audrey Roy Greenfeld Date: Fri, 7 Jul 2023 11:15:04 +0100 Subject: [PATCH 434/443] Update macOS Metal GPU step 4 * Update "today" to version 0.1.62 * Fix numbering (there were 2 step 4's) --- docs/install/macos.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/install/macos.md b/docs/install/macos.md index 6004696..3330396 100644 --- a/docs/install/macos.md +++ b/docs/install/macos.md @@ -26,19 +26,19 @@ conda create -n llama python=3.9.16 conda activate llama ``` -**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** +**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** *(you needed xcode installed in order pip to build/compile the C++ code)* ``` pip uninstall llama-cpp-python -y CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir pip install 'llama-cpp-python[server]' -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      +# you should now have llama-cpp-python v0.1.62 or higher installed +llama-cpp-python         0.1.68 ``` -**(4) Download a v3 ggml model** +**(5) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 From 9e61661518d78973555cb0424d371e943674cd88 Mon Sep 17 00:00:00 2001 From: wu-qing-157 Date: Fri, 7 Jul 2023 10:18:49 +0000 Subject: [PATCH 435/443] fix indexing token_logprobs after sorting --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a7..31d70b7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -958,7 +958,7 @@ class Llama: ) ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } returned_tokens += 1 @@ -1033,7 +1033,7 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } @@ -1131,7 +1131,7 @@ class Llama: zip(logprobs_token, range(len(logprobs_token))), reverse=True ) ) - token_logprobs.append(sorted_logprobs[int(token)][0]) + token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] From a14d8a9b3fdc2f967c7c8905fe7911bddb0935a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 18:58:43 -0400 Subject: [PATCH 436/443] perf: assign to candidates data structure instead --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a7..35823cf 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -487,9 +487,9 @@ class Llama: nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore - candidates_data["logit"] = logits - candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["logit"][:] = logits + candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 7887376bffec533083f2d2170424db076089c39d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:06:54 -0400 Subject: [PATCH 437/443] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 481f793..061f5f8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
481f793acc3882a09d45d8d2c3076ad3d1c60cfc +Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 From 11eae752110f3f69088c6a551c965f42f1507148 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:28:53 -0400 Subject: [PATCH 438/443] perf: avoid allocating new buffers during sampling --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 35823cf..0895182 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,6 +324,8 @@ class Llama: self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -487,9 +489,9 @@ class Llama: nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 52753b77f556c46057f5272b2ee547868cf53397 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 21:38:46 -0400 Subject: [PATCH 439/443] Upgrade fastapi to 0.100.0 and pydantic v2 --- .github/workflows/test.yaml | 6 +++--- docker/cuda_simple/Dockerfile | 2 +- docker/open_llama/Dockerfile | 2 +- docker/openblas_simple/Dockerfile | 2 +- docker/simple/Dockerfile | 2 +- llama_cpp/server/__main__.py | 4 ++-- llama_cpp/server/app.py | 14 ++++---------- pyproject.toml | 2 +- setup.py | 2 +- 9 files changed, 15 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56524e0..a73e347 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . 
-v - name: Test with pytest run: | diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 24906d5..e4a2f07 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -8,7 +8,7 @@ COPY . . # Install the package RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_CUBLAS=1 pip install llama-cpp-python diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile index f0ef5f7..7788f33 100644 --- a/docker/open_llama/Dockerfile +++ b/docker/open_llama/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco ninja-build \ build-essential -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings # Perform the conditional installations based on the image RUN echo "Image: ${IMAGE}" && \ diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile index 1a95cae..8231bdb 100644 --- a/docker/openblas_simple/Dockerfile +++ b/docker/openblas_simple/Dockerfile @@ -7,7 +7,7 @@ COPY . . # Install the package RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index ad36b98..77680c8 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -18,7 +18,7 @@ RUN mkdir /app WORKDIR /app COPY . /app -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN make build && make clean diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 748a2af..2110db3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -3,7 +3,7 @@ To run this example: ```bash -pip install fastapi uvicorn sse-starlette +pip install fastapi uvicorn sse-starlette pydantic-settings export MODEL=../models/7B/... 
``` @@ -30,7 +30,7 @@ from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__fields__.items(): + for name, field in Settings.__model_fields__.items(): description = field.field_info.description if field.default is not None and description is not None: description += f" (default: {field.default})" diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d47160..ffd07fa 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,7 +12,8 @@ from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse @@ -309,7 +310,6 @@ class CreateCompletionRequest(BaseModel): } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def make_logit_bias_processor( @@ -347,7 +347,6 @@ def make_logit_bias_processor( @router.post( "/v1/completions", - response_model=CreateCompletionResponse, ) async def create_completion( request: Request, @@ -416,12 +415,10 @@ class CreateEmbeddingRequest(BaseModel): } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) @router.post( "/v1/embeddings", - response_model=CreateEmbeddingResponse, ) async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) @@ -479,19 +476,17 @@ class CreateChatCompletionRequest(BaseModel): } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) @router.post( "/v1/chat/completions", - response_model=CreateChatCompletionResponse, ) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: +) -> Union[llama_cpp.ChatCompletion]: # type: ignore exclude = { "n", "logit_bias", @@ -551,10 +546,9 @@ class ModelList(TypedDict): data: List[ModelData] -GetModelResponse = create_model_from_typeddict(ModelList) -@router.get("/v1/models", response_model=GetModelResponse) +@router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), ) -> ModelList: diff --git a/pyproject.toml b/pyproject.toml index b3ad3b4..841a868 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ httpx = "^0.24.1" scikit-build = "0.17.6" [tool.poetry.extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"] [build-system] requires = [ diff --git a/setup.py b/setup.py index 32101eb..1d7ecbc 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ From 34c505edf2609acef51b47533f10cd2b8dc2f715 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 22:54:07 -0400 Subject: 
[PATCH 440/443] perf: convert pointer to byref --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0895182..130e013 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -537,7 +537,7 @@ class Llama: mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From ea4fbadab39548673e2a835223968b023006e539 Mon Sep 17 00:00:00 2001 From: AgentJ-WR <60302956+AgentJ-WR@users.noreply.github.com> Date: Fri, 7 Jul 2023 23:24:57 -0400 Subject: [PATCH 441/443] Show how to adjust context window in README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index fb652a9..0322c73 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,15 @@ Below is a short example demonstrating how to use the high-level API to generate } ``` +### Adjusting the Context Window +The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. + +For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: + +```python +llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. From 4f2b5d0b5321bedc879ee9b9a19ca15d18ddb995 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:05:10 -0400 Subject: [PATCH 442/443] Format --- llama_cpp/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 130e013..f8e0527 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,7 +324,7 @@ class Llama: self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() - self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 @@ -445,8 +445,12 @@ class Llama: # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = ( + 0 if self.params.logits_all else n_tokens - 1 + ) # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( + -1 + )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] # Update n_tokens self.n_tokens += n_tokens @@ -491,7 +495,7 @@ class Llama: candidates_data = self._candidates_data candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = self._candidates_data_p # type: ignore + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = 
llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) @@ -537,7 +541,7 @@ class Llama: mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From d6e6aad927690d4bb3229be3f7980a64e46d4866 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:06:11 -0400 Subject: [PATCH 443/443] bugfix: fix compatibility bug with openai api on last token --- llama_cpp/llama.py | 36 ++++++++++++++++++++++++++++++++---- llama_cpp/llama_types.py | 6 ++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f8e0527..d7d3e85 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1060,6 +1060,20 @@ class Llama: ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, "finish_reason": finish_reason, } ], @@ -1078,9 +1092,21 @@ class Llama: ), "index": 0, "logprobs": logprobs_or_none, - "finish_reason": finish_reason - if returned_tokens == len(completion_tokens) - else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": finish_reason, } ], } @@ -1370,7 +1396,9 @@ class Llama: "index": 0, "delta": { "content": chunk["choices"][0]["text"], - }, + } + if chunk["choices"][0]["finish_reason"] is None + else {}, "finish_reason": chunk["choices"][0]["finish_reason"], } ], diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 7729ced..6ba8023 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict +from typing import Any, List, Optional, Dict, Union from typing_extensions import TypedDict, NotRequired, Literal @@ -77,6 +77,8 @@ class ChatCompletion(TypedDict): choices: List[ChatCompletionChoice] usage: CompletionUsage +class ChatCompletionChunkDeltaEmpty(TypedDict): + pass class ChatCompletionChunkDelta(TypedDict): role: NotRequired[Literal["assistant"]] @@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict): class ChatCompletionChunkChoice(TypedDict): index: int - delta: ChatCompletionChunkDelta + delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] finish_reason: Optional[str]
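Two of the patches above are easier to follow with short standalone sketches. Both sketches are illustrative only: the helper names (such as `stream_completion`), the dummy `llama` object, and all numeric values are invented for the example and are not code from this repository.

Patches 430/443 and 432/443 replace the single `llama_lock` with an outer/inner lock pair so that the request currently streaming can notice that another request is queued (the newcomer holds the outer lock while it waits on the inner one) and, when `interrupt_requests` is enabled, end its stream early with `[DONE]`. A minimal sketch of that pattern, using a `contextmanager` in place of the FastAPI dependency:

```python
import threading
import time
from contextlib import contextmanager

llama = object()            # stand-in for the shared llama_cpp.Llama instance
interrupt_requests = True   # stand-in for the Settings.interrupt_requests field

llama_outer_lock = threading.Lock()
llama_inner_lock = threading.Lock()

@contextmanager
def get_llama():
    # Queue on the outer lock, then hold only the inner lock while working,
    # so the request that is currently streaming can poll the outer lock to
    # see whether anyone else is waiting.
    llama_outer_lock.acquire()
    release_outer_lock = True
    try:
        llama_inner_lock.acquire()
        try:
            llama_outer_lock.release()
            release_outer_lock = False
            yield llama
        finally:
            llama_inner_lock.release()
    finally:
        if release_outer_lock:
            llama_outer_lock.release()

def stream_completion(n_tokens: int = 8):
    # Hypothetical stand-in for the streaming branch of create_completion.
    with get_llama():
        for i in range(n_tokens):
            if interrupt_requests and llama_outer_lock.locked():
                yield "[DONE]"   # another request is queued: stop early
                return
            yield f"token {i}"
            time.sleep(0.05)     # pretend to generate
        yield "[DONE]"

if __name__ == "__main__":
    t = threading.Thread(target=lambda: print("first :", list(stream_completion())))
    t.start()
    time.sleep(0.12)                             # let the first stream begin
    print("second:", list(stream_completion()))  # this arrival cuts the first short
    t.join()
```

Patch 435/443 corrects `token_logprobs`: the per-token logprobs are sorted (descending) to build `top_logprobs`, so indexing the sorted list by token id returns whatever happens to sit at that rank rather than the sampled token's logprob. A toy example with invented numbers:

```python
logprobs_token = [-2.3, -0.1, -5.0, -1.2]  # one logprob per vocab id 0..3
token = 3                                  # id of the sampled token

sorted_logprobs = list(
    sorted(zip(logprobs_token, range(len(logprobs_token))), reverse=True)
)
# sorted_logprobs == [(-0.1, 1), (-1.2, 3), (-2.3, 0), (-5.0, 2)]

buggy = sorted_logprobs[int(token)][0]  # -5.0: the 4th-ranked logprob, wrong token
fixed = logprobs_token[int(token)]      # -1.2: the sampled token's logprob
assert fixed == -1.2 and buggy != fixed
```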