Compare commits

..

No commits in common. "2264fbf750b049626903a03329303656c64db883" and "21ac214a38996886195b3d24d9a9e6dfa58ce238" have entirely different histories.

8 changed files with 27 additions and 155 deletions

View file

@ -7,13 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.44]
- feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
- fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
- fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
- fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
## [0.2.43]
- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f

View file

@ -398,22 +398,6 @@ llama = Llama(
)
```
### Embeddings
`llama-cpp-python` supports generating embeddings from text.
```python
import llama_cpp
llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
embeddings = llm.create_embedding("Hello, world!")
# or batched
embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
```
### Adjusting the Context Window
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.44"
__version__ = "0.2.43"

View file

@ -98,7 +98,7 @@ class Llama:
lora_scale: float = 1.0,
lora_path: Optional[str] = None,
# Backend Params
numa: Union[bool, int] = False,
numa: bool = False,
# Chat Format Params
chat_format: Optional[str] = None,
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@ -166,7 +166,7 @@ class Llama:
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
numa: numa policy
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
chat_format: String specifying the chat format to use when calling create_chat_completion.
chat_handler: Optional chat handler to use when calling create_chat_completion.
draft_model: Optional draft model to use for speculative decoding.
@ -183,20 +183,12 @@ class Llama:
set_verbose(verbose)
self.numa = numa
if not Llama.__backend_initialized:
with suppress_stdout_stderr(disable=verbose):
llama_cpp.llama_backend_init()
llama_cpp.llama_backend_init(self.numa)
Llama.__backend_initialized = True
if isinstance(numa, bool):
self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
else:
self.numa = numa
if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
with suppress_stdout_stderr(disable=verbose):
llama_cpp.llama_numa_init(self.numa)
self.model_path = model_path
# Model Params
@ -728,8 +720,6 @@ class Llama:
assert self._model.model is not None
model_name: str = model if model is not None else self.model_path
input = input if isinstance(input, list) else [input]
# get numeric embeddings
embeds: List[List[float]]
total_tokens: int
@ -772,7 +762,7 @@ class Llama:
"""
assert self._ctx.ctx is not None
n_embd = self.n_embd()
n_batch = self.n_batch
n_ctx = self.n_ctx()
if self.context_params.embedding == False:
raise RuntimeError(
@ -792,55 +782,54 @@ class Llama:
# decode and fetch embeddings
data: List[List[float]] = []
def decode_batch(n_seq: int):
def decode_batch(sizes: List[int]):
assert self._ctx.ctx is not None
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
self._ctx.decode(self._batch)
self._batch.reset()
# store embeddings
for i in range(n_seq):
embedding: List[float] = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
for i, s in enumerate(sizes):
embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
:n_embd
]
if normalize:
norm = float(np.linalg.norm(embedding))
embedding = [v / norm for v in embedding]
norm = np.linalg.norm(embedding) if normalize else s
embedding: List[float] = [v / float(norm) for v in embedding]
data.append(embedding)
# init state
total_tokens = 0
t_batch = 0
p_batch = 0
s_sizes: List[int] = []
# accumulate batches and encode
for text in inputs:
tokens = self.tokenize(text.encode("utf-8"))
if truncate:
tokens = tokens[:n_batch]
tokens = tokens[:n_ctx]
n_tokens = len(tokens)
total_tokens += n_tokens
# check for overrun
if n_tokens > n_batch:
if n_tokens > n_ctx:
raise ValueError(
f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
)
# time to eval batch
if t_batch + n_tokens > n_batch:
decode_batch(p_batch)
if t_batch + n_tokens > self._n_ctx:
decode_batch(s_sizes)
t_batch = 0
p_batch = 0
s_sizes = []
# add to batch
self._batch.add_sequence(tokens, p_batch, False)
self._batch.add_sequence(tokens, len(s_sizes), False)
t_batch += n_tokens
p_batch += 1
s_sizes.append(n_tokens)
# handle last batch
decode_batch(p_batch)
decode_batch(s_sizes)
if self.verbose:
llama_cpp.llama_print_timings(self._ctx.ctx)

View file

@ -190,7 +190,6 @@ LLAMA_TOKEN_TYPE_BYTE = 6
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
@ -216,7 +215,6 @@ LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23
LLAMA_FTYPE_MOSTLY_IQ1_S = 24
LLAMA_FTYPE_GUESSED = 1024
# enum llama_rope_scaling_type {
@ -232,15 +230,6 @@ LLAMA_ROPE_SCALING_LINEAR = 1
LLAMA_ROPE_SCALING_YARN = 2
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
# enum llama_pooling_type {
# LLAMA_POOLING_NONE = 0,
# LLAMA_POOLING_MEAN = 1,
# LLAMA_POOLING_CLS = 2,
# };
LLAMA_POOLING_NONE = 0
LLAMA_POOLING_MEAN = 1
LLAMA_POOLING_CLS = 2
# enum llama_split_mode {
# LLAMA_SPLIT_NONE = 0, // single GPU
# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@ -664,18 +653,6 @@ class llama_timings(Structure):
]
# // used in chat template
# typedef struct llama_chat_message {
# const char * role;
# const char * content;
# } llama_chat_message;
# ctypes mirror of the C struct `llama_chat_message` quoted in the comment
# above: two C-string fields, `role` and `content`, declared in that order.
class llama_chat_message(Structure):
_fields_ = [
("role", c_char_p),
("content", c_char_p),
]
# // Helpers for getting default parameters
# LLAMA_API struct llama_model_params llama_model_default_params(void);
def llama_model_default_params() -> llama_model_params:
@ -711,45 +688,17 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
# // If numa is true, use NUMA optimizations
# // Call once at the start of the program
# LLAMA_API void llama_backend_init(bool numa);
# LLAMA_API void llama_backend_init(void);
def llama_backend_init():
def llama_backend_init(numa: Union[c_bool, bool]):
"""Initialize the llama + ggml backend
If numa is true, use NUMA optimizations
Call once at the start of the program"""
return _lib.llama_backend_init()
return _lib.llama_backend_init(numa)
_lib.llama_backend_init.argtypes = []
_lib.llama_backend_init.argtypes = [c_bool]
_lib.llama_backend_init.restype = None
# // numa strategies
# enum ggml_numa_strategy {
# GGML_NUMA_STRATEGY_DISABLED = 0,
# GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
# GGML_NUMA_STRATEGY_ISOLATE = 2,
# GGML_NUMA_STRATEGY_NUMACTL = 3,
# GGML_NUMA_STRATEGY_MIRROR = 4,
# GGML_NUMA_STRATEGY_COUNT
# };
GGML_NUMA_STRATEGY_DISABLED = 0
GGML_NUMA_STRATEGY_DISTRIBUTE = 1
GGML_NUMA_STRATEGY_ISOLATE = 2
GGML_NUMA_STRATEGY_NUMACTL = 3
GGML_NUMA_STRATEGY_MIRROR = 4
GGML_NUMA_STRATEGY_COUNT = 5
# //optional:
# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
# Thin wrapper: forwards the integer `numa` strategy value to the shared
# library's llama_numa_init and returns whatever that call returns.
def llama_numa_init(numa: int):
return _lib.llama_numa_init(numa)
_lib.llama_numa_init.argtypes = [c_int]
_lib.llama_numa_init.restype = None
# // Call once at the end of the program - currently only used for MPI
# LLAMA_API void llama_backend_free(void);
def llama_backend_free():
@ -1968,47 +1917,6 @@ _lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_in
_lib.llama_token_to_piece.restype = c_int32
# /// Apply chat template. Inspired by hf apply_chat_template() on python.
# /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
# /// NOTE: This function only supports some known jinja templates. It is not a jinja parser.
# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the models default chat template will be used instead.
# /// @param chat Pointer to a list of multiple llama_chat_message
# /// @param n_msg Number of llama_chat_message in this chat
# /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
# /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
# /// @param length The size of the allocated buffer
# /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
# LLAMA_API int32_t llama_chat_apply_template(
# const struct llama_model * model,
# const char * tmpl,
# const struct llama_chat_message * chat,
# size_t n_msg,
# bool add_ass,
# char * buf,
# int32_t length);
# Python wrapper around the shared library's llama_chat_apply_template.
# It passes `model`, `tmpl`, `chat` and `n_msg` through unchanged and returns
# the value produced by the underlying call.
# NOTE(review): the C prototype quoted in the comment above takes seven
# parameters (model, tmpl, chat, n_msg, add_ass, buf, length), but this
# wrapper accepts and forwards only the first four. `add_ass`, `buf` and
# `length` are never supplied, and the argtypes list that follows this
# definition also declares only four entries. Confirm against llama.h and
# extend the binding before relying on its output.
def llama_chat_apply_template(
model: llama_model_p,
tmpl: bytes,
chat: "ctypes._Pointer[llama_chat_message]",
n_msg: int,
) -> int:
return _lib.llama_chat_apply_template(
model,
tmpl,
chat,
n_msg
)
_lib.llama_chat_apply_template.argtypes = [
ctypes.c_void_p,
ctypes.c_char_p,
ctypes.POINTER(llama_chat_message),
ctypes.c_size_t
]
_lib.llama_chat_apply_template.restype = ctypes.c_int32
# //
# // Grammar
# //

View file

@ -290,7 +290,6 @@ async def create_completion(
inner_send_chan=send_chan,
iterator=iterator(),
),
sep='\n',
)
else:
return iterator_or_completion
@ -383,7 +382,6 @@ async def create_chat_completion(
inner_send_chan=send_chan,
iterator=iterator(),
),
sep='\n',
)
else:
return iterator_or_completion

View file

@ -2,7 +2,7 @@ from __future__ import annotations
import multiprocessing
from typing import Optional, List, Literal, Union
from typing import Optional, List, Literal
from pydantic import Field
from pydantic_settings import BaseSettings
@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
description="Path to a LoRA file to apply to the model.",
)
# Backend Params
numa: Union[bool, int] = Field(
numa: bool = Field(
default=False,
description="Enable NUMA support.",
)

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit f53119cec4f073b6d214195ecbe1fad3abdf2b34
Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f