This commit is contained in:
commit
c39debbb1e
6 changed files with 32 additions and 10 deletions
|
@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
## [0.2.33]
|
||||||
|
|
||||||
|
- feat: Update llama.cpp to ggerganov/llama.cpp@faa3526a1eba458120987ed8269e5616385a76f4
|
||||||
|
- feat(server): include llama-cpp-python version in openapi spec by @abetlen in cde7514c3d28e6d52f272614e9957208c344dde5
|
||||||
|
- fix: use both eos and bos tokens as stop sequences for hf-tokenizer-config chat format by @abetlen in 5b982d0f8c6f35242c8862ffdce00e17cea0b44f
|
||||||
|
- fix: GGUF metadata KV overrides, re #1011 by @phiharri in #1116
|
||||||
|
- fix: llama_log_set should be able to accept null pointer by @abetlen in c970d41a85381fd55235136f123422df0bf0c7e7
|
||||||
|
|
||||||
## [0.2.32]
|
## [0.2.32]
|
||||||
|
|
||||||
- feat: Update llama.cpp to ggerganov/llama.cpp@504dc37be8446fb09b1ede70300250ad41be32a2
|
- feat: Update llama.cpp to ggerganov/llama.cpp@504dc37be8446fb09b1ede70300250ad41be32a2
|
||||||
|
|
24
README.md
24
README.md
|
@ -104,6 +104,7 @@ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
|
||||||
### Windows Notes
|
### Windows Notes
|
||||||
|
|
||||||
If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install:
|
If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install:
|
||||||
|
|
||||||
```ps
|
```ps
|
||||||
$env:CMAKE_GENERATOR = "MinGW Makefiles"
|
$env:CMAKE_GENERATOR = "MinGW Makefiles"
|
||||||
$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
|
$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
|
||||||
|
@ -118,17 +119,19 @@ Detailed MacOS Metal GPU install documentation is available at [docs/install/mac
|
||||||
#### M1 Mac Performance Issue
|
#### M1 Mac Performance Issue
|
||||||
|
|
||||||
Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
|
Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
|
||||||
```
|
|
||||||
|
```bash
|
||||||
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
|
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
|
||||||
bash Miniforge3-MacOSX-arm64.sh
|
bash Miniforge3-MacOSX-arm64.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
|
Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
|
||||||
|
|
||||||
#### M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`
|
#### M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`
|
||||||
|
|
||||||
Try installing with
|
Try installing with
|
||||||
|
|
||||||
```
|
```bash
|
||||||
CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python
|
CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -152,10 +155,15 @@ Below is a short example demonstrating how to use the high-level API to for basi
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from llama_cpp import Llama
|
>>> from llama_cpp import Llama
|
||||||
>>> llm = Llama(model_path="./models/7B/llama-model.gguf")
|
>>> llm = Llama(
|
||||||
|
model_path="./models/7B/llama-model.gguf",
|
||||||
|
# n_gpu_layers=-1, # Uncomment to use GPU acceleration
|
||||||
|
# seed=1337, # Uncomment to set a specific seed
|
||||||
|
# n_ctx=2048, # Uncomment to increase the context window
|
||||||
|
)
|
||||||
>>> output = llm(
|
>>> output = llm(
|
||||||
"Q: Name the planets in the solar system? A: ", # Prompt
|
"Q: Name the planets in the solar system? A: ", # Prompt
|
||||||
max_tokens=32, # Generate up to 32 tokens
|
max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
|
||||||
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
|
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
|
||||||
echo=True # Echo the prompt back in the output
|
echo=True # Echo the prompt back in the output
|
||||||
) # Generate a completion, can also call create_completion
|
) # Generate a completion, can also call create_completion
|
||||||
|
@ -191,7 +199,10 @@ Note that `chat_format` option must be set for the particular model you are usin
|
||||||
|
|
||||||
```python
|
```python
|
||||||
>>> from llama_cpp import Llama
|
>>> from llama_cpp import Llama
|
||||||
>>> llm = Llama(model_path="path/to/llama-2/llama-model.gguf", chat_format="llama-2")
|
>>> llm = Llama(
|
||||||
|
model_path="path/to/llama-2/llama-model.gguf",
|
||||||
|
chat_format="llama-2"
|
||||||
|
)
|
||||||
>>> llm.create_chat_completion(
|
>>> llm.create_chat_completion(
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "You are an assistant who perfectly describes images."},
|
{"role": "system", "content": "You are an assistant who perfectly describes images."},
|
||||||
|
@ -414,6 +425,9 @@ pip install -e .[all]
|
||||||
make clean
|
make clean
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also test out specific commits of `llama.cpp` by checking out the desired commit in the `vendor/llama.cpp` submodule and then running `make clean` and `pip install -e .` again. Any changes in the `llama.h` API will require
|
||||||
|
changes to the `llama_cpp/llama_cpp.py` file to match the new API (additional changes may be required elsewhere).
|
||||||
|
|
||||||
## FAQ
|
## FAQ
|
||||||
|
|
||||||
### Are there pre-built binaries / binary wheels available?
|
### Are there pre-built binaries / binary wheels available?
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from .llama_cpp import *
|
from .llama_cpp import *
|
||||||
from .llama import *
|
from .llama import *
|
||||||
|
|
||||||
__version__ = "0.2.32"
|
__version__ = "0.2.33"
|
|
@ -2528,7 +2528,7 @@ _lib.llama_print_system_info.restype = c_char_p
|
||||||
# // If this is not called, or NULL is supplied, everything is output on stderr.
|
# // If this is not called, or NULL is supplied, everything is output on stderr.
|
||||||
# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||||
def llama_log_set(
|
def llama_log_set(
|
||||||
log_callback: "ctypes._FuncPointer", user_data: c_void_p # type: ignore
|
log_callback: Union["ctypes._FuncPointer", c_void_p], user_data: c_void_p # type: ignore
|
||||||
):
|
):
|
||||||
"""Set callback for all future logging events.
|
"""Set callback for all future logging events.
|
||||||
|
|
||||||
|
@ -2536,7 +2536,7 @@ def llama_log_set(
|
||||||
return _lib.llama_log_set(log_callback, user_data)
|
return _lib.llama_log_set(log_callback, user_data)
|
||||||
|
|
||||||
|
|
||||||
_lib.llama_log_set.argtypes = [llama_log_callback, c_void_p]
|
_lib.llama_log_set.argtypes = [ctypes.c_void_p, c_void_p]
|
||||||
_lib.llama_log_set.restype = None
|
_lib.llama_log_set.restype = None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -118,7 +118,7 @@ def create_app(
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
middleware=middleware,
|
middleware=middleware,
|
||||||
title="🦙 llama.cpp Python API",
|
title="🦙 llama.cpp Python API",
|
||||||
version="0.0.1",
|
version=llama_cpp.__version__,
|
||||||
)
|
)
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
||||||
Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c
|
Subproject commit faa3526a1eba458120987ed8269e5616385a76f4
|
Loading…
Reference in a new issue