feat: Update llama_cpp.py bindings

Andrei Betlen 2024-06-21 16:56:15 -04:00
parent 35c980eb2e
commit 04959f1884


@@ -468,11 +468,13 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
# LLAMA_POOLING_TYPE_NONE = 0,
# LLAMA_POOLING_TYPE_MEAN = 1,
# LLAMA_POOLING_TYPE_CLS = 2,
# LLAMA_POOLING_TYPE_LAST = 3,
# };
LLAMA_POOLING_TYPE_UNSPECIFIED = -1
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2
LLAMA_POOLING_TYPE_LAST = 3
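For context, the new LLAMA_POOLING_TYPE_LAST value plugs into the existing pooling plumbing. A minimal sketch, assuming the high-level Llama constructor's pooling_type parameter and a hypothetical embedding-model path:

import llama_cpp

# Pool per-token embeddings down to one vector per sequence using
# the hidden state of the last token.
llm = llama_cpp.Llama(
    model_path="/path/to/embedding-model.gguf",  # hypothetical path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_LAST,
)
vector = llm.embed("hello world")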
# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -761,7 +763,6 @@ class llama_model_params(ctypes.Structure):
# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
# // (ignored if no pooling layer)
# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
# float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2316,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
...
# // Set whether the model is in embeddings mode or not
# // If true, embeddings will be returned but logits will not
# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
"""Set whether the model is in embeddings model or not
If true, embeddings will be returned but logits will not"""
...
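A minimal sketch of driving the new binding through the low-level API; the model path is hypothetical, everything else uses functions already present in these bindings:

import llama_cpp

llama_cpp.llama_backend_init()

model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(
    b"/path/to/model.gguf", model_params  # hypothetical path
)

ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# From here on, decoding fills the embedding buffers instead of logits.
llama_cpp.llama_set_embeddings(ctx, True)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()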
# // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
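The matching llama_set_causal_attn binding follows the same pattern; a one-line sketch, reusing the ctx from the example above, for the non-causal attention some embedding models expect:

# Let every token attend to the full sequence, not just past tokens.
llama_cpp.llama_set_causal_attn(ctx, False)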