From 087cc0b036e6a52fda0df6fb4575e908fa0e32fd Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 24 May 2024 01:43:36 -0400
Subject: [PATCH] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 16 ++++++++++++++++
 vendor/llama.cpp       |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 88fa071..4284019 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -2265,6 +2265,22 @@ def llama_set_n_threads(
     ...
 
 
+# // Get the number of threads used for generation of a single token.
+# LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+@ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_threads(ctx: llama_context_p, /) -> int:
+    """Get the number of threads used for generation of a single token"""
+    ...
+
+
+# // Get the number of threads used for prompt and batch processing (multiple token).
+# LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+@ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
+    """Get the number of threads used for prompt and batch processing (multiple token)"""
+    ...
+
+
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 201cc11..0df0aa8 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 201cc11afa0a1950e1f632390b2ac6c937a0d8f0
+Subproject commit 0df0aa8e43c3378975269a51f9b876c8692e70da
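
Note: the patch exposes the llama.cpp getters llama_n_threads and llama_n_threads_batch through the low-level ctypes API. A minimal, untested sketch of how they might be exercised alongside the existing llama_set_n_threads binding is below; the GGUF path and the thread counts (8 generation threads, 16 batch threads) are placeholder values, not part of this patch.

    # Sketch: set thread counts, then read them back via the new bindings.
    import llama_cpp

    llama_cpp.llama_backend_init()

    model_params = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", model_params)  # placeholder path

    ctx_params = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

    # Configure generation and batch thread counts, then verify with the getters.
    llama_cpp.llama_set_n_threads(ctx, 8, 16)
    print(llama_cpp.llama_n_threads(ctx))        # expected: 8
    print(llama_cpp.llama_n_threads_batch(ctx))  # expected: 16

    llama_cpp.llama_free(ctx)
    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()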