baalajimaestro 2024-01-26 12:21:14 +05:30
commit c39debbb1e
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
6 changed files with 32 additions and 10 deletions

CHANGELOG.md

@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.33]
+- feat: Update llama.cpp to ggerganov/llama.cpp@faa3526a1eba458120987ed8269e5616385a76f4
+- feat(server): include llama-cpp-python version in openapi spec by @abetlen in cde7514c3d28e6d52f272614e9957208c344dde5
+- fix: use both eos and bos tokens as stop sequences for hf-tokenizer-config chat format. by @abetlen in 5b982d0f8c6f35242c8862ffdce00e17cea0b44f
+- fix: GGUF metadata KV overrides, re #1011 by @phiharri in #1116
+- fix: llama_log_set should be able to accept null pointer by @abetlen in c970d41a85381fd55235136f123422df0bf0c7e7
 ## [0.2.32]
 - feat: Update llama.cpp to ggerganov/llama.cpp@504dc37be8446fb09b1ede70300250ad41be32a2

README.md

@@ -104,6 +104,7 @@ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
 ### Windows Notes
 If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install:
 ```ps
 $env:CMAKE_GENERATOR = "MinGW Makefiles"
 $env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
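# Once the generator and compilers are set, the install itself is the usual
# pip command (a sketch of the next step; adjust to your environment):
python -m pip install llama-cpp-python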
@@ -118,17 +119,19 @@ Detailed MacOS Metal GPU install documentation is available at [docs/install/mac
 #### M1 Mac Performance Issue
 Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
-```
+```bash
 wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
 bash Miniforge3-MacOSX-arm64.sh
 ```
 Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
 #### M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`
 Try installing with
-```
+```bash
 CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python
 ```
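Both macOS issues above boil down to running an x86_64 Python under Rosetta, so a quick architecture check on the interpreter you are about to build with can save a reinstall (a minimal sketch, standard library only):

```python
import platform

# An Apple Silicon native interpreter reports "arm64"; "x86_64" means the
# build will produce the slow / incompatible x86 wheel described above.
print(platform.machine())
```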
@@ -152,10 +155,15 @@ Below is a short example demonstrating how to use the high-level API to for basi
 ```python
 >>> from llama_cpp import Llama
->>> llm = Llama(model_path="./models/7B/llama-model.gguf")
+>>> llm = Llama(
+      model_path="./models/7B/llama-model.gguf",
+      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+      # seed=1337, # Uncomment to set a specific seed
+      # n_ctx=2048, # Uncomment to increase the context window
+)
 >>> output = llm(
       "Q: Name the planets in the solar system? A: ", # Prompt
-      max_tokens=32, # Generate up to 32 tokens
+      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
       stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
       echo=True # Echo the prompt back in the output
 ) # Generate a completion, can also call create_completion
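>>> # The call returns an OpenAI-style completion dict, so the generated text
>>> # can be read back like this (a minimal follow-up sketch):
>>> print(output["choices"][0]["text"])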
@@ -191,7 +199,10 @@ Note that `chat_format` option must be set for the particular model you are usin
 ```python
 >>> from llama_cpp import Llama
->>> llm = Llama(model_path="path/to/llama-2/llama-model.gguf", chat_format="llama-2")
+>>> llm = Llama(
+      model_path="path/to/llama-2/llama-model.gguf",
+      chat_format="llama-2"
+)
 >>> llm.create_chat_completion(
       messages = [
           {"role": "system", "content": "You are an assistant who perfectly describes images."},
@@ -414,6 +425,9 @@ pip install -e .[all]
 make clean
 ```
+You can also test out specific commits of `llama.cpp` by checking out the desired commit in the `vendor/llama.cpp` submodule and then running `make clean` and `pip install -e .` again. Any changes in the `llama.h` API will require
+changes to the `llama_cpp/llama_cpp.py` file to match the new API (additional changes may be required elsewhere).
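After rebuilding against a different `llama.cpp` commit, a quick smoke test is to import the bindings and call one of the already wrapped functions (a minimal sketch; `llama_print_system_info` is part of the existing `llama_cpp/llama_cpp.py` bindings, as the diff further down shows):

```python
import llama_cpp

# Confirms the rebuilt extension loads, then prints the compile-time feature
# string reported by llama.cpp (BLAS, Metal, AVX flags, and so on).
print(llama_cpp.__version__)
print(llama_cpp.llama_print_system_info().decode("utf-8"))
```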
 ## FAQ
 ### Are there pre-built binaries / binary wheels available?

llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.32"
+__version__ = "0.2.33"

llama_cpp/llama_cpp.py

@@ -2528,7 +2528,7 @@ _lib.llama_print_system_info.restype = c_char_p
 # // If this is not called, or NULL is supplied, everything is output on stderr.
 # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 def llama_log_set(
-    log_callback: "ctypes._FuncPointer", user_data: c_void_p  # type: ignore
+    log_callback: Union["ctypes._FuncPointer", c_void_p], user_data: c_void_p  # type: ignore
 ):
     """Set callback for all future logging events.
@@ -2536,7 +2536,7 @@ def llama_log_set(
     return _lib.llama_log_set(log_callback, user_data)
-_lib.llama_log_set.argtypes = [llama_log_callback, c_void_p]
+_lib.llama_log_set.argtypes = [ctypes.c_void_p, c_void_p]
 _lib.llama_log_set.restype = None
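The relaxed signature is what makes the changelog's "accept null pointer" fix work in practice: per the header comment above, a NULL callback sends all llama.cpp log output back to stderr. A minimal sketch of calling the binding that way (assuming only that the low-level functions are re-exported by the `llama_cpp` package, as they are via its star import):

```python
import ctypes

import llama_cpp

# NULL callback: llama.cpp falls back to its default stderr logging.
llama_cpp.llama_log_set(None, ctypes.c_void_p(0))
```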

llama_cpp/server/app.py

@@ -118,7 +118,7 @@ def create_app(
     app = FastAPI(
         middleware=middleware,
         title="🦙 llama.cpp Python API",
-        version="0.0.1",
+        version=llama_cpp.__version__,
     )
     app.add_middleware(
         CORSMiddleware,
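With the hard-coded "0.0.1" gone, the version in the server's generated OpenAPI document now tracks the installed package. A quick check against a locally running server (host, port, and the `python -m llama_cpp.server` launch command are the defaults here and an assumption; standard library only):

```python
import json
import urllib.request

# Assumes a server started with `python -m llama_cpp.server` on localhost:8000;
# FastAPI serves the OpenAPI document at /openapi.json.
with urllib.request.urlopen("http://localhost:8000/openapi.json") as resp:
    spec = json.load(resp)

print(spec["info"]["version"])  # should now match llama_cpp.__version__ (0.2.33)
```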

vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c
+Subproject commit faa3526a1eba458120987ed8269e5616385a76f4