This commit is contained in:
commit
2264fbf750
8 changed files with 155 additions and 27 deletions
|
@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.44]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
|
||||
- fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
|
||||
- fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
|
||||
- fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
|
||||
|
||||
## [0.2.43]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
|
||||
|
|
16
README.md
16
README.md
|
@ -398,6 +398,22 @@ llama = Llama(
|
|||
)
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
`llama-cpp-python` supports generating embeddings from text.
|
||||
|
||||
```python
|
||||
import llama_cpp
|
||||
|
||||
llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
|
||||
|
||||
embeddings = llm.create_embedding("Hello, world!")
|
||||
|
||||
# or batched
|
||||
|
||||
embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
|
||||
```
|
||||
|
||||
### Adjusting the Context Window
|
||||
|
||||
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .llama_cpp import *
|
||||
from .llama import *
|
||||
|
||||
__version__ = "0.2.43"
|
||||
__version__ = "0.2.44"
|
|
@ -98,7 +98,7 @@ class Llama:
|
|||
lora_scale: float = 1.0,
|
||||
lora_path: Optional[str] = None,
|
||||
# Backend Params
|
||||
numa: bool = False,
|
||||
numa: Union[bool, int] = False,
|
||||
# Chat Format Params
|
||||
chat_format: Optional[str] = None,
|
||||
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
|
||||
|
@ -166,7 +166,7 @@ class Llama:
|
|||
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
|
||||
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
|
||||
lora_path: Path to a LoRA file to apply to the model.
|
||||
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
|
||||
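numa: NUMA policy. Either a bool (True selects GGML_NUMA_STRATEGY_DISTRIBUTE, False selects GGML_NUMA_STRATEGY_DISABLED) or one of the llama_cpp.GGML_NUMA_STRATEGY_* integer values; it is passed to llama_numa_init unless it is GGML_NUMA_STRATEGY_DISABLED.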
|
||||
chat_format: String specifying the chat format to use when calling create_chat_completion.
|
||||
chat_handler: Optional chat handler to use when calling create_chat_completion.
|
||||
draft_model: Optional draft model to use for speculative decoding.
|
||||
|
@ -183,12 +183,20 @@ class Llama:
|
|||
|
||||
set_verbose(verbose)
|
||||
|
||||
self.numa = numa
|
||||
if not Llama.__backend_initialized:
|
||||
with suppress_stdout_stderr(disable=verbose):
|
||||
llama_cpp.llama_backend_init(self.numa)
|
||||
llama_cpp.llama_backend_init()
|
||||
Llama.__backend_initialized = True
|
||||
|
||||
if isinstance(numa, bool):
|
||||
self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
|
||||
else:
|
||||
self.numa = numa
|
||||
|
||||
if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
|
||||
with suppress_stdout_stderr(disable=verbose):
|
||||
llama_cpp.llama_numa_init(self.numa)
|
||||
|
||||
self.model_path = model_path
|
||||
|
||||
# Model Params
|
||||
|
@ -720,6 +728,8 @@ class Llama:
|
|||
assert self._model.model is not None
|
||||
model_name: str = model if model is not None else self.model_path
|
||||
|
||||
input = input if isinstance(input, list) else [input]
|
||||
|
||||
# get numeric embeddings
|
||||
embeds: List[List[float]]
|
||||
total_tokens: int
|
||||
|
@ -762,7 +772,7 @@ class Llama:
|
|||
"""
|
||||
assert self._ctx.ctx is not None
|
||||
n_embd = self.n_embd()
|
||||
n_ctx = self.n_ctx()
|
||||
n_batch = self.n_batch
|
||||
|
||||
if self.context_params.embedding == False:
|
||||
raise RuntimeError(
|
||||
|
@ -782,54 +792,55 @@ class Llama:
|
|||
|
||||
# decode and fetch embeddings
|
||||
data: List[List[float]] = []
|
||||
def decode_batch(sizes: List[int]):
|
||||
def decode_batch(n_seq: int):
|
||||
assert self._ctx.ctx is not None
|
||||
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
|
||||
self._ctx.decode(self._batch)
|
||||
self._batch.reset()
|
||||
|
||||
# store embeddings
|
||||
for i, s in enumerate(sizes):
|
||||
embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
|
||||
for i in range(n_seq):
|
||||
embedding: List[float] = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
|
||||
:n_embd
|
||||
]
|
||||
norm = np.linalg.norm(embedding) if normalize else s
|
||||
embedding: List[float] = [v / float(norm) for v in embedding]
|
||||
if normalize:
|
||||
norm = float(np.linalg.norm(embedding))
|
||||
embedding = [v / norm for v in embedding]
|
||||
data.append(embedding)
|
||||
|
||||
# init state
|
||||
total_tokens = 0
|
||||
t_batch = 0
|
||||
s_sizes: List[int] = []
|
||||
p_batch = 0
|
||||
|
||||
# accumulate batches and encode
|
||||
for text in inputs:
|
||||
tokens = self.tokenize(text.encode("utf-8"))
|
||||
if truncate:
|
||||
tokens = tokens[:n_ctx]
|
||||
tokens = tokens[:n_batch]
|
||||
|
||||
n_tokens = len(tokens)
|
||||
total_tokens += n_tokens
|
||||
|
||||
# check for overrun
|
||||
if n_tokens > n_ctx:
|
||||
if n_tokens > n_batch:
|
||||
raise ValueError(
|
||||
f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
|
||||
f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
|
||||
)
|
||||
|
||||
# time to eval batch
|
||||
if t_batch + n_tokens > self._n_ctx:
|
||||
decode_batch(s_sizes)
|
||||
if t_batch + n_tokens > n_batch:
|
||||
decode_batch(p_batch)
|
||||
t_batch = 0
|
||||
s_sizes = []
|
||||
p_batch = 0
|
||||
|
||||
# add to batch
|
||||
self._batch.add_sequence(tokens, len(s_sizes), False)
|
||||
self._batch.add_sequence(tokens, p_batch, False)
|
||||
t_batch += n_tokens
|
||||
s_sizes.append(n_tokens)
|
||||
p_batch += 1
|
||||
|
||||
# handle last batch
|
||||
decode_batch(s_sizes)
|
||||
decode_batch(p_batch)
|
||||
|
||||
if self.verbose:
|
||||
llama_cpp.llama_print_timings(self._ctx.ctx)
|
||||
|
|
|
@ -190,6 +190,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
|
|||
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
|
||||
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
# };
|
||||
|
@ -215,6 +216,7 @@ LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
|
|||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24
|
||||
LLAMA_FTYPE_GUESSED = 1024
|
||||
|
||||
# enum llama_rope_scaling_type {
|
||||
|
@ -230,6 +232,15 @@ LLAMA_ROPE_SCALING_LINEAR = 1
|
|||
LLAMA_ROPE_SCALING_YARN = 2
|
||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
|
||||
|
||||
# enum llama_pooling_type {
|
||||
# LLAMA_POOLING_NONE = 0,
|
||||
# LLAMA_POOLING_MEAN = 1,
|
||||
# LLAMA_POOLING_CLS = 2,
|
||||
# };
|
||||
LLAMA_POOLING_NONE = 0
|
||||
LLAMA_POOLING_MEAN = 1
|
||||
LLAMA_POOLING_CLS = 2
|
||||
|
||||
# enum llama_split_mode {
|
||||
# LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
|
@ -653,6 +664,18 @@ class llama_timings(Structure):
|
|||
]
|
||||
|
||||
|
||||
# // used in chat template
|
||||
# typedef struct llama_chat_message {
|
||||
# const char * role;
|
||||
# const char * content;
|
||||
# } llama_chat_message;
|
||||
class llama_chat_message(Structure):
|
||||
_fields_ = [
|
||||
("role", c_char_p),
|
||||
("content", c_char_p),
|
||||
]
|
||||
|
||||
|
||||
# // Helpers for getting default parameters
|
||||
# LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
def llama_model_default_params() -> llama_model_params:
|
||||
|
@ -688,17 +711,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
|
|||
# // If numa is true, use NUMA optimizations
|
||||
# // Call once at the start of the program
|
||||
# LLAMA_API void llama_backend_init(bool numa);
|
||||
def llama_backend_init(numa: Union[c_bool, bool]):
|
||||
# LLAMA_API void llama_backend_init(void);
|
||||
def llama_backend_init():
|
||||
"""Initialize the llama + ggml backend
|
||||
NUMA optimizations are not configured here; call llama_numa_init separately to enable them
|
||||
Call once at the start of the program"""
|
||||
return _lib.llama_backend_init(numa)
|
||||
return _lib.llama_backend_init()
|
||||
|
||||
|
||||
_lib.llama_backend_init.argtypes = [c_bool]
|
||||
_lib.llama_backend_init.argtypes = []
|
||||
_lib.llama_backend_init.restype = None
|
||||
|
||||
|
||||
# // numa strategies
|
||||
# enum ggml_numa_strategy {
|
||||
# GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||
# GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||
# GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||
# GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||
# GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||
# GGML_NUMA_STRATEGY_COUNT
|
||||
# };
|
||||
GGML_NUMA_STRATEGY_DISABLED = 0
|
||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1
|
||||
GGML_NUMA_STRATEGY_ISOLATE = 2
|
||||
GGML_NUMA_STRATEGY_NUMACTL = 3
|
||||
GGML_NUMA_STRATEGY_MIRROR = 4
|
||||
GGML_NUMA_STRATEGY_COUNT = 5
|
||||
|
||||
|
||||
# //optional:
|
||||
# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||
def llama_numa_init(numa: int):
|
||||
return _lib.llama_numa_init(numa)
|
||||
|
||||
|
||||
_lib.llama_numa_init.argtypes = [c_int]
|
||||
_lib.llama_numa_init.restype = None
|
||||
|
||||
|
||||
# // Call once at the end of the program - currently only used for MPI
|
||||
# LLAMA_API void llama_backend_free(void);
|
||||
def llama_backend_free():
|
||||
|
@ -1917,6 +1968,47 @@ _lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_in
|
|||
_lib.llama_token_to_piece.restype = c_int32
|
||||
|
||||
|
||||
# /// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||
# /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||
# /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
|
||||
# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
||||
# /// @param chat Pointer to a list of multiple llama_chat_message
|
||||
# /// @param n_msg Number of llama_chat_message in this chat
|
||||
# /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
|
||||
# /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
|
||||
# /// @param length The size of the allocated buffer
|
||||
# /// @return The total number of bytes of the formatted prompt. If it is larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
|
||||
# LLAMA_API int32_t llama_chat_apply_template(
|
||||
# const struct llama_model * model,
|
||||
# const char * tmpl,
|
||||
# const struct llama_chat_message * chat,
|
||||
# size_t n_msg,
|
||||
# bool add_ass,
|
||||
# char * buf,
|
||||
# int32_t length);
|
||||
def llama_chat_apply_template(
    model: llama_model_p,
    tmpl: bytes,
    chat: "ctypes._Pointer[llama_chat_message]",
    n_msg: int,
    add_ass: bool = False,
    buf: "Optional[ctypes.Array[ctypes.c_char]]" = None,
    length: int = 0,
) -> int:
    """Apply a chat template to a list of chat messages.

    Thin wrapper over the C function ``llama_chat_apply_template``. The C
    prototype takes seven arguments; all of them are forwarded here so the
    native call never reads uninitialized values for ``add_ass``, ``buf`` or
    ``length``.

    Args:
        model: Model whose default chat template is used when ``tmpl`` is None.
        tmpl: Jinja template to use for this chat, or None to use the model's
            default template. Only some known templates are supported by the
            native side; it is not a Jinja parser.
        chat: Pointer to an array of ``llama_chat_message``.
        n_msg: Number of messages in ``chat``.
        add_ass: Whether to end the prompt with the token(s) that indicate the
            start of an assistant message.
        buf: Writable character buffer that receives the formatted prompt,
            e.g. ``ctypes.create_string_buffer(length)``. The recommended size
            is 2 * (total number of characters of all messages).
        length: Size of ``buf`` in bytes.

    Returns:
        The total number of bytes of the formatted prompt. If it is larger
        than ``length``, re-allocate the buffer and apply the template again.
    """
    # NOTE(review): the defaults (buf=None, length=0) keep the old four-argument
    # call form working and are meant to only query the required size -- confirm
    # that the vendored llama.cpp tolerates a NULL buffer when length is 0.
    return _lib.llama_chat_apply_template(
        model,
        tmpl,
        chat,
        n_msg,
        add_ass,
        buf,
        length,
    )


# Must list every parameter of the C prototype, in order; ctypes uses this to
# convert and pass the arguments with the correct width.
_lib.llama_chat_apply_template.argtypes = [
    ctypes.c_void_p,  # const struct llama_model * model
    ctypes.c_char_p,  # const char * tmpl
    ctypes.POINTER(llama_chat_message),  # const struct llama_chat_message * chat
    ctypes.c_size_t,  # size_t n_msg
    ctypes.c_bool,  # bool add_ass
    ctypes.c_char_p,  # char * buf
    ctypes.c_int32,  # int32_t length
]
_lib.llama_chat_apply_template.restype = ctypes.c_int32
|
||||
|
||||
|
||||
|
||||
# //
|
||||
# // Grammar
|
||||
# //
|
||||
|
|
|
@ -290,6 +290,7 @@ async def create_completion(
|
|||
inner_send_chan=send_chan,
|
||||
iterator=iterator(),
|
||||
),
|
||||
sep='\n',
|
||||
)
|
||||
else:
|
||||
return iterator_or_completion
|
||||
|
@ -382,6 +383,7 @@ async def create_chat_completion(
|
|||
inner_send_chan=send_chan,
|
||||
iterator=iterator(),
|
||||
),
|
||||
sep='\n',
|
||||
)
|
||||
else:
|
||||
return iterator_or_completion
|
||||
|
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
|
||||
import multiprocessing
|
||||
|
||||
from typing import Optional, List, Literal
|
||||
from typing import Optional, List, Literal, Union
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
|
|||
description="Path to a LoRA file to apply to the model.",
|
||||
)
|
||||
# Backend Params
|
||||
numa: bool = Field(
|
||||
numa: Union[bool, int] = Field(
|
||||
default=False,
|
||||
description="Enable NUMA support.",
|
||||
)
|
||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f
|
||||
Subproject commit f53119cec4f073b6d214195ecbe1fad3abdf2b34
|
Loading…
Reference in a new issue