From dfe8608096cf0115e2e198821ca0c86e57b6a09a Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 24 Mar 2023 19:10:31 -0400
Subject: [PATCH] Update examples

---
 examples/fastapi_server.py                    |  2 +
 examples/low_level_api_inference.py           | 49 -------------------
 ...cpp_main.py => low_level_api_llama_cpp.py} |  0
 3 files changed, 2 insertions(+), 49 deletions(-)
 delete mode 100644 examples/low_level_api_inference.py
 rename examples/{llama_cpp_main.py => low_level_api_llama_cpp.py} (100%)

diff --git a/examples/fastapi_server.py b/examples/fastapi_server.py
index 93b6edb..46a7b8a 100644
--- a/examples/fastapi_server.py
+++ b/examples/fastapi_server.py
@@ -1,3 +1,5 @@
+"""Example FastAPI server for llama.cpp.
+"""
 from typing import List, Optional
 
 from llama_cpp import Llama
diff --git a/examples/low_level_api_inference.py b/examples/low_level_api_inference.py
deleted file mode 100644
index 0031f2e..0000000
--- a/examples/low_level_api_inference.py
+++ /dev/null
@@ -1,49 +0,0 @@
-""" Example of low-level API inference based on https://github.com/ggerganov/llama.cpp/issues/384#issuecomment-1480129622
-"""
-import multiprocessing
-
-import llama_cpp
-
-N_THREADS = multiprocessing.cpu_count()
-
-prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
-
-lparams = llama_cpp.llama_context_default_params()
-
-ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams)
-tmp = [0, 1, 2, 3]
-llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)
-
-embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
-embd_inp = embd_inp[:n_of_tok]
-
-for i in range(len(embd_inp)):
-    llama_cpp.llama_eval(ctx, (llama_cpp.c_int * 1)(embd_inp[i]), 1, i, N_THREADS)
-
-prediction = b""
-embd = embd_inp
-
-n = 8
-
-for i in range(n):
-    id = llama_cpp.llama_sample_top_p_top_k(
-        ctx,
-        (llama_cpp.c_int * len(embd))(*embd),
-        n_of_tok + i,
-        40,
-        0.8,
-        0.2,
-        1.0 / 0.85,
-    )
-
-    embd.append(id)
-
-    prediction += llama_cpp.llama_token_to_str(ctx, id)
-
-    llama_cpp.llama_eval(ctx, (llama_cpp.c_int * 1)(embd[-1]), 1, len(embd), N_THREADS)
-
-
-llama_cpp.llama_free(ctx)
-
-print(prediction.decode("utf-8"))
diff --git a/examples/llama_cpp_main.py b/examples/low_level_api_llama_cpp.py
similarity index 100%
rename from examples/llama_cpp_main.py
rename to examples/low_level_api_llama_cpp.py