diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 6d24324..1116b1f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -468,11 +468,13 @@ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 #     LLAMA_POOLING_TYPE_NONE = 0,
 #     LLAMA_POOLING_TYPE_MEAN = 1,
 #     LLAMA_POOLING_TYPE_CLS = 2,
+#     LLAMA_POOLING_TYPE_LAST = 3,
 # };
 LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
+LLAMA_POOLING_TYPE_LAST = 3

 # enum llama_split_mode {
 #     LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -761,7 +763,6 @@ class llama_model_params(ctypes.Structure):

 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-#                                                     // (ignored if no pooling layer)
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054

 #     float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -2316,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
     ...


+# // Set whether the model is in embeddings mode or not
+# // If true, embeddings will be returned but logits will not
+# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+    """Set whether the model is in embeddings mode or not
+    If true, embeddings will be returned but logits will not"""
+    ...
+
+
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
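
For context, a minimal sketch of how the two additions above could be used together through the low-level ctypes bindings. The model path "model.gguf" is hypothetical, and batch setup, decoding, and error handling are omitted; this illustrates the API surface, not a documented workflow.

# A minimal sketch (assumptions: a local GGUF model at "model.gguf"; no error handling).
import llama_cpp

llama_cpp.llama_backend_init()

model = llama_cpp.llama_load_model_from_file(
    b"model.gguf", llama_cpp.llama_model_default_params()
)

ctx_params = llama_cpp.llama_context_default_params()
# Use the pooling mode added in this diff: the sequence embedding is the
# last token's hidden state rather than a mean over tokens or the CLS position.
ctx_params.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_LAST
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# Flip the context into embeddings mode at runtime: per the new docstring,
# decode calls now return embeddings and suppress logits.
llama_cpp.llama_set_embeddings(ctx, True)
# ... llama_decode(...) a batch here, then read the embeddings ...

# Flip back to get logits (text generation) from the same context.
llama_cpp.llama_set_embeddings(ctx, False)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()

The runtime toggle is the notable part: previously the embeddings-vs-logits choice was fixed at context creation via the `embeddings` field of `llama_context_params`, whereas `llama_set_embeddings` lets one context alternate between embedding extraction and generation.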