Merge https://github.com/abetlen/llama-cpp-python

feat: Update llama.cpp
2024-02-19 18:51:06 +05:30 · 2024-02-19 04:11:34 -05:00 · 2024-02-18 21:30:36 -05:00 · 2024-02-17 01:02:33 -05:00 · 2024-02-17 00:37:51 -05:00 · 2024-02-15 23:15:50 -05:00
8 changed files with 155 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ## [0.2.44]
 - feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
 - fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
 - fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
 - fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
 ## [0.2.43]
 - feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
--- a/README.md
+++ b/README.md
@ -398,6 +398,22 @@ llama = Llama(
 )
 ```
 ### Embeddings
 `llama-cpp-python` supports generating embeddings from the text.
 ```python
 import llama_cpp
 llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
 embeddings = llm.create_embedding("Hello, world!")
 # or batched
 embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
 ```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
--- a/llama_cpp/init.py
+++ b/llama_cpp/init.py
@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.43"
+__version__ = "0.2.44"
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -98,7 +98,7 @@ class Llama:
        lora_scale: float = 1.0,
        lora_path: Optional[str] = None,
        # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
        # Chat Format Params
        chat_format: Optional[str] = None,
        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@ -166,7 +166,7 @@ class Llama:
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
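+            numa: NUMA policy. A bool selects llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE (True) or llama_cpp.GGML_NUMA_STRATEGY_DISABLED (False); an int is used as-is as a ggml_numa_strategy value (one of llama_cpp.GGML_NUMA_STRATEGY_*).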
            chat_format: String specifying the chat format to use when calling create_chat_completion.
            chat_handler: Optional chat handler to use when calling create_chat_completion.
            draft_model: Optional draft model to use for speculative decoding.
@ -183,12 +183,20 @@ class Llama:
        set_verbose(verbose)
        self.numa = numa
        if not Llama.__backend_initialized:
            with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
            Llama.__backend_initialized = True
        if isinstance(numa, bool):
            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
        else:
            self.numa = numa
        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
            with suppress_stdout_stderr(disable=verbose):
                llama_cpp.llama_numa_init(self.numa)
        self.model_path = model_path
        # Model Params
@ -720,6 +728,8 @@ class Llama:
        assert self._model.model is not None
        model_name: str = model if model is not None else self.model_path
        input = input if isinstance(input, list) else [input]
        # get numeric embeddings
        embeds: List[List[float]]
        total_tokens: int
@ -762,7 +772,7 @@ class Llama:
        """
        assert self._ctx.ctx is not None
        n_embd = self.n_embd()
-        n_ctx = self.n_ctx()
+        n_batch = self.n_batch
        if self.context_params.embedding == False:
            raise RuntimeError(
@ -782,54 +792,55 @@ class Llama:
        # decode and fetch embeddings
        data: List[List[float]] = []
-        def decode_batch(sizes: List[int]):
+        def decode_batch(n_seq: int):
            assert self._ctx.ctx is not None
            llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
            self._ctx.decode(self._batch)
            self._batch.reset()
            # store embeddings
-            for i, s in enumerate(sizes):
+            for i in range(n_seq):
-                embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
+                embedding: List[float] = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
                    :n_embd
                ]
-                norm = np.linalg.norm(embedding) if normalize else s
+                if normalize:
-                embedding: List[float] = [v / float(norm) for v in embedding]
+                    norm = float(np.linalg.norm(embedding))
                    embedding = [v / norm for v in embedding]
                data.append(embedding)
        # init state
        total_tokens = 0
        t_batch = 0
-        s_sizes: List[int] = []
+        p_batch = 0
        # accumulate batches and encode
        for text in inputs:
            tokens = self.tokenize(text.encode("utf-8"))
            if truncate:
-                tokens = tokens[:n_ctx]
+                tokens = tokens[:n_batch]
            n_tokens = len(tokens)
            total_tokens += n_tokens
            # check for overrun
-            if n_tokens > n_ctx:
+            if n_tokens > n_batch:
                raise ValueError(
-                    f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
+                    f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
                )
            # time to eval batch
-            if t_batch + n_tokens > self._n_ctx:
+            if t_batch + n_tokens > n_batch:
-                decode_batch(s_sizes)
+                decode_batch(p_batch)
                t_batch = 0
-                s_sizes = []
+                p_batch = 0
            # add to batch
-            self._batch.add_sequence(tokens, len(s_sizes), False)
+            self._batch.add_sequence(tokens, p_batch, False)
            t_batch += n_tokens
-            s_sizes.append(n_tokens)
+            p_batch += 1
        # handle last batch
-        decode_batch(s_sizes)
+        decode_batch(p_batch)
        if self.verbose:
            llama_cpp.llama_print_timings(self._ctx.ctx)
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -190,6 +190,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@ -215,6 +216,7 @@ LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
 LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
 LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23
 LLAMA_FTYPE_MOSTLY_IQ1_S = 24
 LLAMA_FTYPE_GUESSED = 1024
 # enum llama_rope_scaling_type {
@ -230,6 +232,15 @@ LLAMA_ROPE_SCALING_LINEAR = 1
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 # enum llama_pooling_type {
 #     LLAMA_POOLING_NONE = 0,
 #     LLAMA_POOLING_MEAN = 1,
 #     LLAMA_POOLING_CLS  = 2,
 # };
 LLAMA_POOLING_NONE = 0
 LLAMA_POOLING_MEAN = 1
 LLAMA_POOLING_CLS = 2
 # enum llama_split_mode {
 #     LLAMA_SPLIT_NONE    = 0, // single GPU
 #     LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
@ -653,6 +664,18 @@ class llama_timings(Structure):
    ]
 # // used in chat template
 # typedef struct llama_chat_message {
 #     const char * role;
 #     const char * content;
 # } llama_chat_message;
 class llama_chat_message(Structure):
    """ctypes mirror of the C struct ``llama_chat_message``.

    Holds one chat message as two C strings (``const char *``): the
    message ``role`` and its ``content``. A pointer to this structure is
    the ``chat`` argument type of ``llama_chat_apply_template``.
    """
    _fields_ = [
        ("role", c_char_p),
        ("content", c_char_p),
    ]
 # // Helpers for getting default parameters
 # LLAMA_API struct llama_model_params llama_model_default_params(void);
 def llama_model_default_params() -> llama_model_params:
@ -688,17 +711,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
 def llama_backend_init():
    """Initialize the llama + ggml backend
    Call once at the start of the program.
    This function no longer takes a ``numa`` argument; NUMA is set up
    separately through ``llama_numa_init``."""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()
-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None
 # // numa strategies
 # enum ggml_numa_strategy {
 #     GGML_NUMA_STRATEGY_DISABLED   = 0,
 #     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
 #     GGML_NUMA_STRATEGY_ISOLATE    = 2,
 #     GGML_NUMA_STRATEGY_NUMACTL    = 3,
 #     GGML_NUMA_STRATEGY_MIRROR     = 4,
 #     GGML_NUMA_STRATEGY_COUNT
 # };
 GGML_NUMA_STRATEGY_DISABLED = 0
 GGML_NUMA_STRATEGY_DISTRIBUTE = 1
 GGML_NUMA_STRATEGY_ISOLATE = 2
 GGML_NUMA_STRATEGY_NUMACTL = 3
 GGML_NUMA_STRATEGY_MIRROR = 4
 GGML_NUMA_STRATEGY_COUNT = 5
 # //optional:
 # LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 def llama_numa_init(numa: int) -> None:
    """Call the native ``llama_numa_init`` with the given NUMA strategy.

    The C header marks this call as optional.

    Args:
        numa: A ``ggml_numa_strategy`` value, i.e. one of the
            ``GGML_NUMA_STRATEGY_*`` constants defined in this module.
    """
    return _lib.llama_numa_init(numa)
 _lib.llama_numa_init.argtypes = [c_int]
 _lib.llama_numa_init.restype = None
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
@ -1917,6 +1968,47 @@ _lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_in
 _lib.llama_token_to_piece.restype = c_int32
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
 # /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
 # /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
 # /// @param chat Pointer to a list of multiple llama_chat_message
 # /// @param n_msg Number of llama_chat_message in this chat
 # /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
 # /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
 # /// @param length The size of the allocated buffer
 # /// @return The total number of bytes of the formatted prompt. If it is larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
 # LLAMA_API int32_t llama_chat_apply_template(
 #           const struct llama_model * model,
 #                         const char * tmpl,
 #    const struct llama_chat_message * chat,
 #                             size_t   n_msg,
 #                               bool   add_ass,
 #                               char * buf,
 #                            int32_t   length);
 def llama_chat_apply_template(
        model: llama_model_p,
        tmpl: bytes,
        chat: "ctypes._Pointer[llama_chat_message]",
        n_msg: int,
        add_ass: bool = False,
        buf: "Optional[ctypes.Array[ctypes.c_char]]" = None,
        length: int = 0,
 ) -> int:
    """Apply a chat template to a list of messages.

    The native function takes seven arguments; all of them are forwarded
    here so that the C side never reads ``add_ass``, ``buf`` or ``length``
    from undefined argument slots.

    Args:
        model: Model whose default chat template is used when ``tmpl`` is NULL.
        tmpl: Jinja template to use for this chat (takes precedence over
            the model's template). Only some known templates are supported.
        chat: Pointer to an array of ``llama_chat_message``.
        n_msg: Number of messages in ``chat``.
        add_ass: Whether to end the prompt with the token(s) that indicate
            the start of an assistant message.
        buf: Writable output buffer for the formatted prompt, e.g. one made
            with ``ctypes.create_string_buffer``. Do not pass an immutable
            ``bytes`` object: the native code writes into this memory. The
            header recommends 2 * (total characters of all messages).
        length: Size in bytes of ``buf``.

    Returns:
        The total number of bytes of the formatted prompt. If it is larger
        than ``length``, re-allocate the buffer and apply the template again.

    NOTE(review): the defaults (``buf=None``, ``length=0``) keep the old
    four-argument call form working and pass a NULL buffer; confirm that
    the vendored llama.cpp tolerates a NULL ``buf`` before relying on this
    as a pure size query.
    """
    return _lib.llama_chat_apply_template(
        model,
        tmpl,
        chat,
        n_msg,
        add_ass,
        buf,
        length,
    )
 # Must list all seven parameters of the C prototype, in order.
 _lib.llama_chat_apply_template.argtypes = [
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.POINTER(llama_chat_message),
    ctypes.c_size_t,
    ctypes.c_bool,
    ctypes.c_char_p,
    ctypes.c_int32,
 ]
 _lib.llama_chat_apply_template.restype = ctypes.c_int32
 # //
 # // Grammar
 # //
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -290,6 +290,7 @@ async def create_completion(
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
            sep='\n',
        )
    else:
        return iterator_or_completion
@ -382,6 +383,7 @@ async def create_chat_completion(
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
            sep='\n',
        )
    else:
        return iterator_or_completion
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@ -2,7 +2,7 @@ from __future__ import annotations
 import multiprocessing
-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union
 from pydantic import Field
 from pydantic_settings import BaseSettings
@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
        description="Path to a LoRA file to apply to the model.",
    )
    # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f
+Subproject commit f53119cec4f073b6d214195ecbe1fad3abdf2b34
Author	SHA1	Message	Date
baalajimaestro	2264fbf750	Merge https://github.com/abetlen/llama-cpp-python	2024-02-19 18:51:06 +05:30
Andrei Betlen	6225f027e5	feat: Update llama.cpp	2024-02-19 04:11:34 -05:00
Andrei Betlen	748c0ce057	feat: Update llama.cpp	2024-02-18 21:30:36 -05:00
Andrei Betlen	53f6f5f415	fix: self.numa missing	2024-02-17 01:02:33 -05:00
Andrei Betlen	fdce078cb9	feat: Update llama.cpp	2024-02-17 00:37:51 -05:00
Andrei Betlen	c2a234a086	docs: Add embeddings section	2024-02-15 23:15:50 -05:00
Andrei Betlen	f736827b9b	chore: Bump version	2024-02-15 23:10:50 -05:00
Andrei Betlen	0ce66bc080	fix: create_embedding broken response for input type str	2024-02-15 16:09:48 -05:00
khimaros	ea1f88dd29	fix: Use '\n' seperator for EventSourceResponse (#1188 ) this fixes compatibility with some OpenAI clients, including BetterChatGPT (https://github.com/ztjhz/BetterChatGPT/issues/537). Co-authored-by: Andrei <abetlen@gmail.com>	2024-02-15 15:20:13 -05:00
Andrei Betlen	a5cfeb7763	feat: Update llama.cpp	2024-02-15 15:17:30 -05:00
Douglas Hanley	7bb91f025f	fix: Incorporate embedding pooling layer fixes (#1194 ) * remove division by token count * truncate to n_batch, not n_ctx	2024-02-15 15:16:30 -05:00
		`@ -1 +1 @@`
			`Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f`				`Subproject commit f53119cec4f073b6d214195ecbe1fad3abdf2b34`