bug fixing (#925)
This commit is contained in:
parent
f3117c0cf6
commit
6dde6bd09c
2 changed files with 86 additions and 8 deletions
|
@ -11,20 +11,34 @@ MODEL_PATH = os.environ.get('MODEL', "../models/7B/ggml-model.bin")
|
||||||
|
|
||||||
prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
|
prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
|
||||||
|
|
||||||
lparams = llama_cpp.llama_context_default_params()
|
lparams = llama_cpp.llama_model_default_params()
|
||||||
|
cparams = llama_cpp.llama_context_default_params()
|
||||||
model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode('utf-8'), lparams)
|
model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode('utf-8'), lparams)
|
||||||
ctx = llama_cpp.llama_new_context_with_model(model, lparams)
|
ctx = llama_cpp.llama_new_context_with_model(model, cparams)
|
||||||
|
|
||||||
# determine the required inference memory per token:
|
# determine the required inference memory per token:
|
||||||
tmp = [0, 1, 2, 3]
|
tmp = [0, 1, 2, 3]
|
||||||
llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)
|
llama_cpp.llama_eval(
|
||||||
|
ctx = ctx,
|
||||||
|
tokens=(llama_cpp.c_int * len(tmp))(*tmp),
|
||||||
|
n_tokens=len(tmp),
|
||||||
|
n_past=0
|
||||||
|
)# Deprecated
|
||||||
|
|
||||||
n_past = 0
|
n_past = 0
|
||||||
|
|
||||||
prompt = b" " + prompt
|
prompt = b" " + prompt
|
||||||
|
|
||||||
embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
|
embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
|
||||||
n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
|
n_of_tok = llama_cpp.llama_tokenize(
|
||||||
|
model=model,
|
||||||
|
text=bytes(str(prompt),'utf-8'),
|
||||||
|
text_len=len(embd_inp),
|
||||||
|
tokens=embd_inp,
|
||||||
|
n_max_tokens=len(embd_inp),
|
||||||
|
add_bos=False,
|
||||||
|
special=False
|
||||||
|
)
|
||||||
embd_inp = embd_inp[:n_of_tok]
|
embd_inp = embd_inp[:n_of_tok]
|
||||||
|
|
||||||
n_ctx = llama_cpp.llama_n_ctx(ctx)
|
n_ctx = llama_cpp.llama_n_ctx(ctx)
|
||||||
|
@ -49,8 +63,11 @@ presence_penalty = 0.0
|
||||||
while remaining_tokens > 0:
|
while remaining_tokens > 0:
|
||||||
if len(embd) > 0:
|
if len(embd) > 0:
|
||||||
llama_cpp.llama_eval(
|
llama_cpp.llama_eval(
|
||||||
ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
|
ctx = ctx,
|
||||||
)
|
tokens=(llama_cpp.c_int * len(embd))(*embd),
|
||||||
|
n_tokens=len(embd),
|
||||||
|
n_past=n_past
|
||||||
|
)# Deprecated
|
||||||
|
|
||||||
n_past += len(embd)
|
n_past += len(embd)
|
||||||
embd = []
|
embd = []
|
||||||
|
@ -93,7 +110,7 @@ while remaining_tokens > 0:
|
||||||
for id in embd:
|
for id in embd:
|
||||||
size = 32
|
size = 32
|
||||||
buffer = (ctypes.c_char * size)()
|
buffer = (ctypes.c_char * size)()
|
||||||
n = llama_cpp.llama_token_to_piece_with_model(
|
n = llama_cpp.llama_token_to_piece(
|
||||||
model, llama_cpp.llama_token(id), buffer, size)
|
model, llama_cpp.llama_token(id), buffer, size)
|
||||||
assert n <= size
|
assert n <= size
|
||||||
print(
|
print(
|
||||||
|
@ -109,4 +126,4 @@ print()
|
||||||
|
|
||||||
llama_cpp.llama_print_timings(ctx)
|
llama_cpp.llama_print_timings(ctx)
|
||||||
|
|
||||||
llama_cpp.llama_free(ctx)
|
llama_cpp.llama_free(ctx)
|
61
examples/low_level_api/readme/low_level_api_llama_cpp.md
Normal file
61
examples/low_level_api/readme/low_level_api_llama_cpp.md
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# Low-Level API for Llama_cpp
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
This Python script, low_level_api_llama_cpp.py, demonstrates how to use the low-level API of the llama_cpp library. It runs inference on a .gguf model to generate a response to a given prompt.
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
Before running the script, ensure that you have the following dependencies installed:
|
||||||
|
|
||||||
|
. Python 3.6 or higher
|
||||||
|
. llama_cpp: A C++ library for working with .gguf models
|
||||||
|
. NumPy: A fundamental package for scientific computing with Python
|
||||||
|
. multiprocessing: A Python module for parallel computing
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
Install dependencies:
|
||||||
|
```bash
|
||||||
|
python -m pip install llama-cpp-python  # ctypes, os and multiprocessing ship with Python's standard library
|
||||||
|
```
|
||||||
|
Run the script:
|
||||||
|
```bash
|
||||||
|
python low_level_api_llama_cpp.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Code Structure
|
||||||
|
The script is organized as follows:
|
||||||
|
|
||||||
|
### . Initialization:
|
||||||
|
Load the model from the specified path.
|
||||||
|
Create a context for model evaluation.
|
||||||
|
|
||||||
|
### . Tokenization:
|
||||||
|
Tokenize the input prompt using the llama_tokenize function.
|
||||||
|
Prepare the input tokens for model evaluation.
|
||||||
|
|
||||||
|
### . Inference:
|
||||||
|
Perform model evaluation to generate responses.
|
||||||
|
Sample from the model's output using various strategies (top-k, top-p, temperature).
|
||||||
|
|
||||||
|
### . Output:
|
||||||
|
Print the generated tokens and the corresponding decoded text.
|
||||||
|
|
||||||
|
### . Cleanup:
|
||||||
|
Free resources and print timing information.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
Customize the inference behavior by adjusting the following variables:
|
||||||
|
|
||||||
|
#### . N_THREADS: Number of CPU threads to use for model evaluation.
|
||||||
|
#### . MODEL_PATH: Path to the model file.
|
||||||
|
#### . prompt: Input prompt for the chatbot.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
. Ensure that the llama_cpp library is built and available in the system. Follow the instructions in the llama_cpp repository for building and installing the library.
|
||||||
|
|
||||||
|
. This script is designed to work with .gguf models and may require modifications for compatibility with other model formats.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
This code is based on the llama_cpp library developed by the community. Special thanks to the contributors for their efforts.
|
||||||
|
|
||||||
|
## License
|
||||||
|
This project is licensed under the MIT License - see the LICENSE file for details.
|
Loading…
Reference in a new issue