Update llama.cpp

Andrei Betlen 2024-02-13 12:24:00 -05:00
parent 68fb71b6a2
commit f7cdf78788
2 changed files with 19 additions and 1 deletion

llama_cpp/llama_cpp.py

@@ -470,6 +470,7 @@ class llama_model_params(Structure):
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 # bool embedding;  // embedding mode only
 # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
@@ -496,6 +497,7 @@ class llama_context_params(Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+        do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
     """
     _fields_ = [
@@ -520,6 +522,7 @@ class llama_context_params(Structure):
         ("logits_all", c_bool),
         ("embedding", c_bool),
         ("offload_kqv", c_bool),
+        ("do_pooling", c_bool),
     ]
@@ -1699,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p]
 _lib.llama_get_embeddings.restype = c_float_p
+
+
+# // Get the embeddings for the ith sequence
+# // llama_get_embeddings(ctx) + i*n_embd
+# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+def llama_get_embeddings_ith(
+    ctx: llama_context_p, i: Union[c_int32, int]
+):  # type: (...) -> Array[float] # type: ignore
+    """Get the embeddings for the ith sequence
+    llama_get_embeddings(ctx) + i*n_embd"""
+    return _lib.llama_get_embeddings_ith(ctx, i)
+
+
+_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32]
+_lib.llama_get_embeddings_ith.restype = c_float_p
 # //
 # // Vocab
 # //
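The new wrapper returns a raw float pointer into the context's embeddings buffer, laid out as one n_embd-sized row per sequence. A hedged sketch of reading one row back into Python, assuming model and ctx handles were created elsewhere and a batch containing sequence 0 has already been decoded (llama_n_embd is the existing binding for the embedding width):

import llama_cpp

# Assumes `model` (llama_model_p) and `ctx` (llama_context_p) exist and
# a batch with sequence id 0 was decoded with do_pooling enabled.
n_embd = llama_cpp.llama_n_embd(model)

# The result is llama_get_embeddings(ctx) + 0*n_embd; slicing the ctypes
# float pointer copies n_embd floats into a plain Python list.
row = llama_cpp.llama_get_embeddings_ith(ctx, 0)
embedding = row[:n_embd]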

vendor/llama.cpp vendored

@@ -1 +1 @@
-Subproject commit 895407f31b358e3d9335e847d13f033491ec8a5b
+Subproject commit ea9c8e11436ad50719987fa23a289c74b7b40d40