From eb7645b3ba84e182a903663d68c0b4864b670f9b Mon Sep 17 00:00:00 2001 From: Tanner Hobson Date: Fri, 9 Jun 2023 13:13:08 -0400 Subject: [PATCH 01/27] Add support for logit_bias and logit_bias_type parameters --- llama_cpp/llama.py | 2 ++ llama_cpp/server/app.py | 53 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 02fe774..197511c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1380,6 +1380,7 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1421,6 +1422,7 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..a6194f5 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -249,13 +249,14 @@ class CreateCompletionRequest(BaseModel): ) presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -274,6 +275,39 @@ class CreateCompletionRequest(BaseModel): CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +def make_logit_bias_processor( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], +): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias: Dict[int, float] = {} + if logit_bias_type == "input_ids": + for input_id, score in logit_bias.items(): + input_id = int(input_id) + to_bias[input_id] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + token = token.encode('utf-8') + for input_id in llama.tokenize(token, add_bos=False): + to_bias[input_id] = score + + def logit_bias_processor( + input_ids: List[int], + scores: List[float], + ) -> List[float]: + new_scores = [None] * len(scores) + for input_id, score in enumerate(scores): + new_scores[input_id] = score + to_bias.get(input_id, 0.0) + + return new_scores + + return logit_bias_processor + + @router.post( "/v1/completions", response_model=CreateCompletionResponse, @@ -291,9 +325,16 @@ async def create_completion( "n", "best_of", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) @@ -372,11 +413,12 @@ class CreateChatCompletionRequest(BaseModel): stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: 
Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -413,9 +455,16 @@ async def create_chat_completion( exclude = { "n", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) From 3ea31930e57a45a0806488950e841efbb575369a Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 00:58:08 +0100 Subject: [PATCH 02/27] fixes abetlen/llama-cpp-python #358 --- llama_cpp/server/__main__.py | 2 +- llama_cpp/server/app.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fe1d94..1de4548 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + app, host=settings.host, port=settings.port ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..2191005 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -72,6 +72,12 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) + host: str = Field( + default="localhost", description="Listen address" + ) + port: int = Field( + default=8000, description="Listen port" + ) router = APIRouter() From 3129a0e7e581f6edd29a497a13ab014687867134 Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 01:11:24 +0100 Subject: [PATCH 03/27] correction to add back environment variable support <3 docker --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 1de4548..748a2af 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=settings.host, port=settings.port + app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) ) From efcf380490af7007389df698ddfe1b0f755e7069 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Jun 2023 21:03:40 +0000 Subject: [PATCH 04/27] Bump fastapi from 0.96.0 to 0.97.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.96.0 to 0.97.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.96.0...0.97.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 112 ++++++++++--------------------------------------- pyproject.toml | 2 +- 2 files changed, 23 insertions(+), 91 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4a9c572..1d95d76 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "anyio" version = "3.6.2" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -25,7 +24,6 @@ trio = ["trio (>=0.16,<0.22)"] name = "black" version = "23.3.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -75,7 +73,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bleach" version = "6.0.0" description = "An easy safelist-based HTML-sanitizing tool." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -94,7 +91,6 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -106,7 +102,6 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "dev" optional = false python-versions = "*" files = [ @@ -183,7 +178,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -268,7 +262,6 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -283,7 +276,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -295,7 +287,6 @@ files = [ name = "cryptography" version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -333,11 +324,21 @@ test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-co test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "diskcache" +version = "5.6.1" +description = "Disk Cache -- Disk and file backed persistent cache." 
+optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, + {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, +] + [[package]] name = "distro" version = "1.8.0" description = "Distro - an OS platform information API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -349,7 +350,6 @@ files = [ name = "docutils" version = "0.20" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -361,7 +361,6 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -374,31 +373,26 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.96.0" +version = "0.97.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" -category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.96.0-py3-none-any.whl", hash = "sha256:b8e11fe81e81eab4e1504209917338e0b80f783878a42c2b99467e5e1019a1e9"}, - {file = "fastapi-0.96.0.tar.gz", hash = "sha256:71232d47c2787446991c81c41c249f8a16238d52d779c0e6b43927d3773dbe3c"}, + {file = "fastapi-0.97.0-py3-none-any.whl", hash = "sha256:95d757511c596409930bd20673358d4a4d709004edb85c5d24d6ffc48fabcbf2"}, + {file = "fastapi-0.97.0.tar.gz", hash = "sha256:b53248ee45f64f19bb7600953696e3edf94b0f7de94df1e5433fc5c6136fa986"}, ] [package.dependencies] -pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" starlette = ">=0.27.0,<0.28.0" [package.extras] all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] -dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>=0.12.0,<0.21.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"] -test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff (==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"] [[package]] name = "ghp-import" version = "2.1.0" description = "Copy your docs directly to the gh-pages branch." -category = "dev" optional = false python-versions = "*" files = [ @@ -416,7 +410,6 @@ dev = ["flake8", "markdown", "twine", "wheel"] name = "griffe" version = "0.27.3" description = "Signatures for entire Python programs. 
Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -431,7 +424,6 @@ colorama = ">=0.4" name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -443,7 +435,6 @@ files = [ name = "httpcore" version = "0.17.0" description = "A minimal low-level HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -455,17 +446,16 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" +sniffio = "==1.*" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -481,15 +471,14 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -501,7 +490,6 @@ files = [ name = "importlib-metadata" version = "6.6.0" description = "Read metadata from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -521,7 +509,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -540,7 +527,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -552,7 +538,6 @@ files = [ name = "jaraco-classes" version = "3.2.3" description = "Utility functions for Python class constructs" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -571,7 +556,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -587,7 +571,6 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -605,7 +588,6 @@ i18n = ["Babel (>=2.7)"] name = "keyring" version = "23.13.1" description = "Store and access your passwords safely." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -630,7 +612,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "markdown" version = "3.3.7" description = "Python implementation of Markdown." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -648,7 +629,6 @@ testing = ["coverage", "pyyaml"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -673,7 +653,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.2" description = "Safely add untrusted strings to HTML/XML markup." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -733,7 +712,6 @@ files = [ name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -745,7 +723,6 @@ files = [ name = "mergedeep" version = "1.3.4" description = "A deep merge function for 🐍." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -757,7 +734,6 @@ files = [ name = "mkdocs" version = "1.4.3" description = "Project documentation with Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -786,7 +762,6 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp name = "mkdocs-autorefs" version = "0.4.1" description = "Automatically link across pages in MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -802,7 +777,6 @@ mkdocs = ">=1.1" name = "mkdocs-material" version = "9.1.15" description = "Documentation that simply works" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -825,7 +799,6 @@ requests = ">=2.26" name = "mkdocs-material-extensions" version = "1.1.1" description = "Extension pack for Python Markdown and MkDocs Material." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -837,7 +810,6 @@ files = [ name = "mkdocstrings" version = "0.22.0" description = "Automatic documentation from sources, for MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -865,7 +837,6 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] name = "mkdocstrings-python" version = "0.10.1" description = "A Python handler for mkdocstrings." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -881,7 +852,6 @@ mkdocstrings = ">=0.20" name = "more-itertools" version = "9.1.0" description = "More routines for operating on iterables, beyond itertools" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -893,7 +863,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -905,7 +874,6 @@ files = [ name = "numpy" version = "1.24.3" description = "Fundamental package for array computing in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -943,7 +911,6 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -955,7 +922,6 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -967,7 +933,6 @@ files = [ name = "pkginfo" version = "1.9.6" description = "Query metadata from sdists / bdists / installed packages." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -982,7 +947,6 @@ testing = ["pytest", "pytest-cov"] name = "platformdirs" version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -998,7 +962,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1014,7 +977,6 @@ testing = ["pytest", "pytest-benchmark"] name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1026,7 +988,6 @@ files = [ name = "pydantic" version = "1.10.7" description = "Data validation and settings management using python type hints" -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1079,7 +1040,6 @@ email = ["email-validator (>=1.0.3)"] name = "pygments" version = "2.15.1" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1094,7 +1054,6 @@ plugins = ["importlib-metadata"] name = "pymdown-extensions" version = "9.11" description = "Extension pack for Python Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1110,7 +1069,6 @@ pyyaml = "*" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1133,7 +1091,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1148,7 +1105,6 @@ six = ">=1.5" name = "pywin32-ctypes" version = "0.2.0" description = "" -category = "dev" optional = false python-versions = "*" files = [ @@ -1160,7 +1116,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1210,7 +1165,6 @@ files = [ name = "pyyaml-env-tag" version = "0.1" description = "A custom YAML tag for referencing environment variables in YAML files. " -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1225,7 +1179,6 @@ pyyaml = "*" name = "readme-renderer" version = "37.3" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1245,7 +1198,6 @@ md = ["cmarkgfm (>=0.8.0)"] name = "regex" version = "2023.5.5" description = "Alternative regular expression module, to replace re." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1343,7 +1295,6 @@ files = [ name = "requests" version = "2.30.0" description = "Python HTTP for Humans." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1365,7 +1316,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-toolbelt" version = "1.0.0" description = "A utility belt for advanced users of python-requests" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1380,7 +1330,6 @@ requests = ">=2.0.1,<3.0.0" name = "rfc3986" version = "2.0.0" description = "Validating URI References per RFC 3986" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1395,7 +1344,6 @@ idna2008 = ["idna"] name = "rich" version = "13.3.5" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -1415,7 +1363,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] name = "scikit-build" version = "0.17.6" description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1440,7 +1387,6 @@ test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6 name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1456,7 +1402,6 @@ jeepney = ">=0.6" name = "setuptools" version = "67.7.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1473,7 +1418,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1485,7 +1429,6 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1497,7 +1440,6 @@ files = [ name = "sse-starlette" version = "1.6.1" description = "\"SSE plugin for Starlette\"" -category = "main" optional = true python-versions = ">=3.8" files = [ @@ -1512,7 +1454,6 @@ starlette = "*" name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1531,7 +1472,6 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1543,7 +1483,6 @@ files = [ name = "twine" version = "4.0.2" description = "Collection of utilities for publishing packages on PyPI" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1566,7 +1505,6 @@ urllib3 = ">=1.26.0" name = "typing-extensions" version = "4.6.3" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1578,7 +1516,6 @@ files = [ name = "urllib3" version = "2.0.2" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1596,7 +1533,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "uvicorn" version = "0.22.0" description = "The lightning-fast ASGI server." 
-category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1615,7 +1551,6 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "watchdog" version = "3.0.0" description = "Filesystem events monitoring" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1655,7 +1590,6 @@ watchmedo = ["PyYAML (>=3.10)"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" -category = "dev" optional = false python-versions = "*" files = [ @@ -1667,7 +1601,6 @@ files = [ name = "wheel" version = "0.40.0" description = "A built-package format for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1682,7 +1615,6 @@ test = ["pytest (>=6.0.0)"] name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1695,9 +1627,9 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "5c3354c253bc7ab7c7577a9a3733c7a341e91176e1d0c13dc2e3f3dcc0971bbe" +content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" diff --git a/pyproject.toml b/pyproject.toml index 564059c..f2dd4b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.3" numpy = "^1.20.0" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.96.0", optional = true } +fastapi = { version = "^0.97.0", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From 94f63a66b9f6f24f3e0079efa0f98c5872ef3a82 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:49:19 +1000 Subject: [PATCH 05/27] Create macos_install.md add MacOS Metal markdown install instructions --- docs/macos_install.md | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/macos_install.md diff --git a/docs/macos_install.md b/docs/macos_install.md new file mode 100644 index 0000000..7d46bc4 --- /dev/null +++ b/docs/macos_install.md @@ -0,0 +1,62 @@ + +# llama-cpp-python - MacOS Install with Metal GPU + + +**(1) Make sure you have xcode installed... at least the command line parts** +``` +# check the path of your xcode install +xcode-select -p + +# xcode installed returns +# /Applications/Xcode-beta.app/Contents/Developer + +# if xcode is missing then install it... it takes ages; +xcode-select --install +``` + +**(2) Install the conda version for MacOS that supports Metal GPU** +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` + +**(3) Make a conda environment** +``` +conda create -n llama python=3.9.16 +conda activate llama +``` + +**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** + *(you needed xcode installed in order pip to build/compile the C++ code)* +``` +pip uninstall llama-cpp-python -y +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir +pip install 'llama-cpp-python[server]' + +# you should now have llama-cpp-python v0.1.62 installed +llama-cpp-python         0.1.62      + +``` + +**(4) Download a v3 ggml llama/vicuna/alpaca model** + - **ggmlv3** + - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 + +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin +https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin + + +**(6) run the llama-cpp-python API server with MacOS Metal GPU support** +``` +# config your ggml model path +# make sure it is ggml v3 +# make sure it is q4_0 +export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 +``` + +***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* + + From 7ca50a3e45a89fda886a3f8179b7a70fc2bda197 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:52:22 +1000 Subject: [PATCH 06/27] Update README.md add link to main README>md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ee6e540..a4ca04d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,9 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): From 613dd70c8a9e54c373428055102283fdd468f09b Mon Sep 17 00:00:00 2001 From: Matt Dennewitz Date: Tue, 13 Jun 2023 00:56:05 -0500 Subject: [PATCH 07/27] Update README.md Fixes typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6e540..c099cbf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from PyPI (requires a c compiler): pip install llama-cpp-python ``` -The above command will attempt to install the package and build build `llama.cpp` from source. +The above command will attempt to install the package and build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: From fe41cb9043e4ca54e7a0989baae68eb5b730a0b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:07:50 +0000 Subject: [PATCH 08/27] Bump pytest from 7.3.1 to 7.3.2 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.1 to 7.3.2. 
- [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.3.1...7.3.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1d95d76..e720acc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1067,13 +1067,13 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.3.1" +version = "7.3.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, + {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, ] [package.dependencies] @@ -1085,7 +1085,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "python-dateutil" @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" +content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" diff --git a/pyproject.toml b/pyproject.toml index f2dd4b7..9d1be84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.15" -pytest = "^7.3.1" +pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From 715f98c591e9249acc051e73b9757666e656ab57 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:40:13 -0400 Subject: [PATCH 09/27] Update llama.cpp --- llama_cpp/llama_cpp.py | 22 ++++++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 29136c7..be5e9c3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -155,6 +155,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs +# bool low_vram; // if true, reduce VRAM usage at the cost of performance # int seed; // RNG seed, -1 for random # bool f16_kv; // use fp16 for KV cache @@ -177,6 +178,7 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("low_vram", c_bool), ("seed", c_int), ("f16_kv", c_bool), ( @@ -555,6 +557,26 @@ 
_lib.llama_n_embd.argtypes = [llama_context_p] _lib.llama_n_embd.restype = c_int +# // Get the vocabulary as output parameters. +# // Returns number of results. +# LLAMA_API int llama_get_vocab( +# const struct llama_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab( + ctx: llama_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab(ctx, strings, scores, capacity) + + +_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] +_lib.llama_get_vocab.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4de0334..254a7a7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2 +Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 From f27393ab7ed06c769aba414dcaf2d544ab0c4c35 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:46:48 -0400 Subject: [PATCH 10/27] Add additional verbose logs for cache --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2191005..e248472 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -119,8 +119,12 @@ def create_app(settings: Optional[Settings] = None): ) if settings.cache: if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) From f7c5cfaf503eb251202f609dbbc8b5b337771de5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:08:28 -0400 Subject: [PATCH 11/27] Format server options --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 999d1e6..0d011f0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -260,18 +260,18 @@ class CreateCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + logprobs: Optional[int] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { @@ -424,7 +424,6 @@ class CreateChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = 
model_field @@ -434,6 +433,7 @@ class CreateChatCompletionRequest(BaseModel): # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { From 44b83cada5a9183d42a42670252b97b2ea7b37f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:12:33 -0400 Subject: [PATCH 12/27] Add low_vram parameter --- llama_cpp/llama.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 46a9aeb..a6f1e76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -219,6 +219,7 @@ class Llama: last_n_tokens_size: int = 64, lora_base: Optional[str] = None, lora_path: Optional[str] = None, + low_vram: bool = False, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -260,6 +261,7 @@ class Llama: self.params.use_mmap = use_mmap if lora_path is None else False self.params.use_mlock = use_mlock self.params.embedding = embedding + self.params.low_vram = low_vram self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1447,6 +1449,7 @@ class Llama: use_mmap=self.params.use_mmap, use_mlock=self.params.use_mlock, embedding=self.params.embedding, + low_vram=self.params.low_vram, last_n_tokens_size=self.last_n_tokens_size, n_batch=self.n_batch, n_threads=self.n_threads, @@ -1470,6 +1473,7 @@ class Llama: use_mmap=state["use_mmap"], use_mlock=state["use_mlock"], embedding=state["embedding"], + low_vram=state["low_vram"], n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], From 1e20be6d0c0ada75bbd30ae855d17569dd346b8f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:13:42 -0400 Subject: [PATCH 13/27] Add low_vram to server settings --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d011f0..313e27d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,10 @@ class Settings(BaseSettings): description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") + low_vram: bool = Field( + default=False, + description="Whether to use less VRAM. This will reduce performance.", + ) last_n_tokens_size: int = Field( default=64, ge=0, From 54e2e4ffde8eac57ca3f0ad117b878837d7c3d1f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:22 -0400 Subject: [PATCH 14/27] Move metal docs to metal section of README. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c29202c..0e62f3d 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,6 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) - ## Installation from PyPI (recommended) @@ -73,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + ## High-level API The high-level API provides a simple managed interface through the `Llama` class. 
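The README hunk above points readers at the high-level `Llama` class, and patches 12 and 13 thread the new `low_vram` flag from the server settings down into `llama_context_params`. A minimal sketch of how a caller might exercise both; the model path, prompt, and flag values are illustrative placeholders rather than project defaults:

```python
from llama_cpp import Llama

# Placeholder path; any local ggml model file works here.
llm = Llama(
    model_path="./models/ggml-model-q4_0.bin",
    n_gpu_layers=1,   # offload layers to the GPU on Metal/CUDA builds
    low_vram=True,    # flag added in patches 12/13: lower VRAM use at some speed cost
    seed=-1,          # -1 selects a random seed
)

# Calling the model runs a completion and returns an OpenAI-style response dict.
out = llm("Q: Name the planets in the solar system. A:", max_tokens=48, stop=["Q:"])
print(out["choices"][0]["text"])
```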
From d938e5900369d4af2dfe86e1f51cd402cb58c87c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:44 -0400 Subject: [PATCH 15/27] Bump version --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf6ed5d..7a01f6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Add full gpu utilisation in CUDA +- (llama.cpp) Add get_vocab +- (llama.cpp) Add low_vram parameter +- (server) Add logit_bias parameter + ## [0.1.62] ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 9d1be84..281e1bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.62" +version = "0.1.63" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bb423d8..0449149 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.62", + version="0.1.63", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From fd9f294b3a5194844f50d3b77cc71c51b8ffbb52 Mon Sep 17 00:00:00 2001 From: imaprogrammer <46126206+nb-programmer@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:11:57 +0530 Subject: [PATCH 16/27] Update llama.py: Added how many input tokens in ValueError exception --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a6f1e76..366f050 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -814,7 +814,7 @@ class Llama: llama_cpp.llama_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: - raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") + raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}") # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( From 37d5192a92a9e3a861027af03dab7a792436fad7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 10:41:51 -0400 Subject: [PATCH 17/27] Update docs --- docs/macos_install.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/macos_install.md b/docs/macos_install.md index 7d46bc4..33dcb5d 100644 --- a/docs/macos_install.md +++ b/docs/macos_install.md @@ -38,14 +38,11 @@ llama-cpp-python         0.1.62      ``` -**(4) Download a v3 ggml llama/vicuna/alpaca model** +**(4) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin -https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin -https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML **(6) run the llama-cpp-python API server with MacOS Metal GPU support** From d7153abcf820b6ad39192857a1be8b806595990d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 23:11:14 -0400 Subject: [PATCH 18/27] Update llama.cpp --- 
llama_cpp/llama_cpp.py | 6 +++--- vendor/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index be5e9c3..d6be0ea 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -618,7 +618,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -# LLAMA_API llama_token llama_token_bos(); +# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -627,7 +627,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -# LLAMA_API llama_token llama_token_eos(); +# LLAMA_API llama_token llama_token_eos(); // end-of-sentence def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -636,7 +636,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# LLAMA_API llama_token llama_token_nl(); +# LLAMA_API llama_token llama_token_nl(); // next-line def llama_token_nl() -> int: return _lib.llama_token_nl() diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 254a7a7..d411968 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 +Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd From 60426b23cc6b9f715214ec09a144e477bfcb2b06 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:37:14 -0400 Subject: [PATCH 19/27] Update llama.cpp --- CHANGELOG.md | 6 ++++++ vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a01f6d..9fba95d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- (llama.cpp) Update llama.cpp + +## [0.1.63] + +### Added + - (llama.cpp) Add full gpu utilisation in CUDA - (llama.cpp) Add get_vocab - (llama.cpp) Add low_vram parameter diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d411968..4f9c43e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd +Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 From d410f12fae32bf77a8eedc05e7bef263dc6b7cfd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:38:48 -0400 Subject: [PATCH 20/27] Update docs. Closes #386 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 366f050..a0b2030 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -228,7 +228,7 @@ class Llama: model_path: Path to the model. n_ctx: Maximum context size. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. 0 for random. + seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. From c7d7d5b656cb63ab54c17483dec2ba36b45142f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:39:48 -0400 Subject: [PATCH 21/27] Update Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fba95d..c4cd88c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - (llama.cpp) Update llama.cpp +- Fix docs for seed. Set -1 for random. 
## [0.1.63] From 44dcb5cf715cd384af85b99d13190c8d96f1f85e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:37:20 -0400 Subject: [PATCH 22/27] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4f9c43e..8596af4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 +Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef From 92b0013427be9a1fcea29a3090aa51d0fd8fb35f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:48:43 -0400 Subject: [PATCH 23/27] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4cd88c..0060af5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.64] + ### Added - (llama.cpp) Update llama.cpp diff --git a/pyproject.toml b/pyproject.toml index 281e1bb..eb7d23b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.63" +version = "0.1.64" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 0449149..cc17564 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.63", + version="0.1.64", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From d5974a1096860e13a7dd6c123bd4557497c6b70c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Jun 2023 21:07:49 +0000 Subject: [PATCH 24/27] Bump mkdocs-material from 9.1.15 to 9.1.16 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.15 to 9.1.16. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.15...9.1.16) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index e720acc..e006449 100644 --- a/poetry.lock +++ b/poetry.lock @@ -775,13 +775,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.15" +version = "9.1.16" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.15-py3-none-any.whl", hash = "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"}, - {file = "mkdocs_material-9.1.15.tar.gz", hash = "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a"}, + {file = "mkdocs_material-9.1.16-py3-none-any.whl", hash = "sha256:f9e62558a6b01ffac314423cbc223d970c25fbc78999860226245b64e64d6751"}, + {file = "mkdocs_material-9.1.16.tar.gz", hash = "sha256:1021bfea20f00a9423530c8c2ae9be3c78b80f5a527b3f822e6de3d872e5ab79"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" +content-hash = "fabdd2d7dba563fe7b01b4592dfb33e520b5f6e67317ce5f03205ecba396a577" diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..19015b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.15" +mkdocs-material = "^9.1.16" pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From e37798777e8aed908787f209396190438d724c72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:10 -0400 Subject: [PATCH 25/27] Update llama.cpp --- CHANGELOG.md | 4 ++++ llama_cpp/llama_cpp.py | 26 +++++++++++--------------- vendor/llama.cpp | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0060af5..a6cb99b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Fix struct misalignment bug + ## [0.1.64] ### Added diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d6be0ea..a516829 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -150,47 +150,43 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # struct llama_context_params { +# int seed; // RNG seed, -1 for random # int n_ctx; // text context # int n_batch; // prompt processing batch size # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# int seed; // RNG seed, -1 for random +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# // Keep the booleans together to avoid misalignment during copy-by-value. 
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache # bool logits_all; // the llama_eval() call computes all logits, not just the last one # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible # bool use_mlock; // force system to keep model in RAM # bool embedding; // embedding mode only - - -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; # }; class llama_context_params(Structure): _fields_ = [ + ("seed", c_int), ("n_ctx", c_int), ("n_batch", c_int), ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("progress_callback", llama_progress_callback), + ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), - ("seed", c_int), ("f16_kv", c_bool), - ( - "logits_all", - c_bool, - ), + ("logits_all", c_bool), ("vocab_only", c_bool), ("use_mmap", c_bool), ("use_mlock", c_bool), ("embedding", c_bool), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8596af4..2322ec2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef +Subproject commit 2322ec223a21625dfe9bd73ee677444a98a24ac9 From 3e7eae479631890196823324e0573416408f52a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:44 -0400 Subject: [PATCH 26/27] Bump Version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6cb99b..d5925bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.65] + ### Added - (llama.cpp) Fix struct misalignment bug diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..dac026c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.64" +version = "0.1.65" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index cc17564..9f27648 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.64", + version="0.1.65", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 282698b6d383e216e129856f25b0ca41348ad525 Mon Sep 17 00:00:00 2001 From: Alexey Date: Fri, 23 Jun 2023 00:19:24 +0400 Subject: [PATCH 27/27] server: pass seed param from command line to llama --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 313e27d..ef319c7 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -30,6 +30,9 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + seed: int = Field( + default=1337, description="Random seed. -1 for random." + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -109,6 +112,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap,
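The final patch above adds a `seed` field to the server `Settings` and forwards it into the `Llama` constructor. A short sketch of how the settings-driven entry point from the `__main__.py` patches might be driven programmatically; the model path and field values are examples only:

```python
import uvicorn
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/ggml-model-q4_0.bin",  # placeholder model path
    seed=42,          # new field from this patch, forwarded as Llama(seed=...)
    n_gpu_layers=1,
)

app = create_app(settings=settings)
# Mirrors __main__.py: host and port come from the Settings fields added in patch 02.
uvicorn.run(app, host=settings.host, port=settings.port)
```

Because `Settings` is a pydantic `BaseSettings`, the same fields can also be supplied through environment variables, which is consistent with the HOST/PORT environment fallback that patch 03 restores for the Docker use case.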