Merge https://github.com/abetlen/llama-cpp-python

2024-02-19 18:51:06 +05:30 · 2024-02-19 18:51:06 +05:30 · 2264fbf750
commit 2264fbf750
parent 21ac214a38 6225f027e5
8 changed files with 155 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.44]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
+- fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
+- fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
+- fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
+
 ## [0.2.43]

 - feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
--- a/README.md
+++ b/README.md
@ -398,6 +398,22 @@ llama = Llama(
 )
 ```

+### Embeddings
+
+`llama-cpp-python` supports generating embeddings from text.
+
+```python
+import llama_cpp
+
+llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
+
+embeddings = llm.create_embedding("Hello, world!")
+
+# or batched
+
+embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
+```
+
 ### Adjusting the Context Window

 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
--- a/llama_cpp/init.py
+++ b/llama_cpp/init.py
@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.43"
+__version__ = "0.2.44"
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -98,7 +98,7 @@ class Llama:
        lora_scale: float = 1.0,
        lora_path: Optional[str] = None,
        # Backend Params
-        numa: bool = False,
+        numa: Union[bool, int] = False,
        # Chat Format Params
        chat_format: Optional[str] = None,
        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
@ -166,7 +166,7 @@ class Llama:
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
-            numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
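+            numa: NUMA policy. A bool maps to GGML_NUMA_STRATEGY_DISTRIBUTE (True) or GGML_NUMA_STRATEGY_DISABLED (False); an int is used as-is as a GGML_NUMA_STRATEGY_* value and passed to llama_numa_init unless it is DISABLED.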
            chat_format: String specifying the chat format to use when calling create_chat_completion.
            chat_handler: Optional chat handler to use when calling create_chat_completion.
            draft_model: Optional draft model to use for speculative decoding.
@ -183,12 +183,20 @@ class Llama:

        set_verbose(verbose)

-        self.numa = numa
        if not Llama.__backend_initialized:
            with suppress_stdout_stderr(disable=verbose):
-                llama_cpp.llama_backend_init(self.numa)
+                llama_cpp.llama_backend_init()
            Llama.__backend_initialized = True

+        if isinstance(numa, bool):
+            self.numa = llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE if numa else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+        else:
+            self.numa = numa
+
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+
        self.model_path = model_path

        # Model Params
@ -720,6 +728,8 @@ class Llama:
        assert self._model.model is not None
        model_name: str = model if model is not None else self.model_path

+        input = input if isinstance(input, list) else [input]
+
        # get numeric embeddings
        embeds: List[List[float]]
        total_tokens: int
@ -762,7 +772,7 @@ class Llama:
        """
        assert self._ctx.ctx is not None
        n_embd = self.n_embd()
-        n_ctx = self.n_ctx()
+        n_batch = self.n_batch

        if self.context_params.embedding == False:
            raise RuntimeError(
@ -782,54 +792,55 @@ class Llama:

        # decode and fetch embeddings
        data: List[List[float]] = []
-        def decode_batch(sizes: List[int]):
+        def decode_batch(n_seq: int):
            assert self._ctx.ctx is not None
            llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
            self._ctx.decode(self._batch)
            self._batch.reset()

            # store embeddings
-            for i, s in enumerate(sizes):
-                embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
+            for i in range(n_seq):
+                embedding: List[float] = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
                    :n_embd
                ]
-                norm = np.linalg.norm(embedding) if normalize else s
-                embedding: List[float] = [v / float(norm) for v in embedding]
+                if normalize:
+                    norm = float(np.linalg.norm(embedding))
+                    embedding = [v / norm for v in embedding]
                data.append(embedding)

        # init state
        total_tokens = 0
        t_batch = 0
-        s_sizes: List[int] = []
+        p_batch = 0

        # accumulate batches and encode
        for text in inputs:
            tokens = self.tokenize(text.encode("utf-8"))
            if truncate:
-                tokens = tokens[:n_ctx]
+                tokens = tokens[:n_batch]

            n_tokens = len(tokens)
            total_tokens += n_tokens

            # check for overrun
-            if n_tokens > n_ctx:
+            if n_tokens > n_batch:
                raise ValueError(
-                    f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
+                    f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
                )

            # time to eval batch
-            if t_batch + n_tokens > self._n_ctx:
-                decode_batch(s_sizes)
+            if t_batch + n_tokens > n_batch:
+                decode_batch(p_batch)
                t_batch = 0
-                s_sizes = []
+                p_batch = 0

            # add to batch
-            self._batch.add_sequence(tokens, len(s_sizes), False)
+            self._batch.add_sequence(tokens, p_batch, False)
            t_batch += n_tokens
-            s_sizes.append(n_tokens)
+            p_batch += 1

        # handle last batch
-        decode_batch(s_sizes)
+        decode_batch(p_batch)

        if self.verbose:
            llama_cpp.llama_print_timings(self._ctx.ctx)
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -190,6 +190,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors

 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@ -215,6 +216,7 @@ LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
 LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
 LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23
+LLAMA_FTYPE_MOSTLY_IQ1_S = 24
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
@ -230,6 +232,15 @@ LLAMA_ROPE_SCALING_LINEAR = 1
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN

+# enum llama_pooling_type {
+#     LLAMA_POOLING_NONE = 0,
+#     LLAMA_POOLING_MEAN = 1,
+#     LLAMA_POOLING_CLS  = 2,
+# };
+LLAMA_POOLING_NONE = 0
+LLAMA_POOLING_MEAN = 1
+LLAMA_POOLING_CLS = 2
+
 # enum llama_split_mode {
 #     LLAMA_SPLIT_NONE    = 0, // single GPU
 #     LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
@ -653,6 +664,18 @@ class llama_timings(Structure):
    ]


+# // used in chat template
+# typedef struct llama_chat_message {
+#     const char * role;
+#     const char * content;
+# } llama_chat_message;
+class llama_chat_message(Structure):
+    _fields_ = [
+        ("role", c_char_p),
+        ("content", c_char_p),
+    ]
+
+
 # // Helpers for getting default parameters
 # LLAMA_API struct llama_model_params llama_model_default_params(void);
 def llama_model_default_params() -> llama_model_params:
@ -688,17 +711,45 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: Union[c_bool, bool]):
+# LLAMA_API void llama_backend_init(void);
+def llama_backend_init():
    """Initialize the llama + ggml backend
    NUMA is not configured here; use llama_numa_init for NUMA optimizations
    Call once at the start of the program"""
-    return _lib.llama_backend_init(numa)
+    return _lib.llama_backend_init()


-_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.argtypes = []
 _lib.llama_backend_init.restype = None


+# // numa strategies
+# enum ggml_numa_strategy {
+#     GGML_NUMA_STRATEGY_DISABLED   = 0,
+#     GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+#     GGML_NUMA_STRATEGY_ISOLATE    = 2,
+#     GGML_NUMA_STRATEGY_NUMACTL    = 3,
+#     GGML_NUMA_STRATEGY_MIRROR     = 4,
+#     GGML_NUMA_STRATEGY_COUNT
+# };
+GGML_NUMA_STRATEGY_DISABLED = 0
+GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+GGML_NUMA_STRATEGY_ISOLATE = 2
+GGML_NUMA_STRATEGY_NUMACTL = 3
+GGML_NUMA_STRATEGY_MIRROR = 4
+GGML_NUMA_STRATEGY_COUNT = 5
+
+
+# //optional:
+# LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
+def llama_numa_init(numa: int):
+    return _lib.llama_numa_init(numa)
+
+
+_lib.llama_numa_init.argtypes = [c_int]
+_lib.llama_numa_init.restype = None
+
+
 # // Call once at the end of the program - currently only used for MPI
 # LLAMA_API void llama_backend_free(void);
 def llama_backend_free():
@ -1917,6 +1968,47 @@ _lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_in
 _lib.llama_token_to_piece.restype = c_int32


+# /// Apply chat template. Inspired by hf apply_chat_template() on python.
+# /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+# /// NOTE: This function only supports some known jinja templates. It is not a jinja parser.
+# /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+# /// @param chat Pointer to a list of multiple llama_chat_message
+# /// @param n_msg Number of llama_chat_message in this chat
+# /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+# /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+# /// @param length The size of the allocated buffer
+# /// @return The total number of bytes of the formatted prompt. If it is larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+# LLAMA_API int32_t llama_chat_apply_template(
+#           const struct llama_model * model,
+#                         const char * tmpl,
+#    const struct llama_chat_message * chat,
+#                             size_t   n_msg,
+#                               bool   add_ass,
+#                               char * buf,
+#                            int32_t   length);
+def llama_chat_apply_template(
+        model: llama_model_p,
+        tmpl: bytes,
+        chat: "ctypes._Pointer[llama_chat_message]",
+        n_msg: int,
+) -> int:
+    return _lib.llama_chat_apply_template(
+        model,
+        tmpl,
+        chat,
+        n_msg
+    )
+
+_lib.llama_chat_apply_template.argtypes = [
+    ctypes.c_void_p,
+    ctypes.c_char_p,
+    ctypes.POINTER(llama_chat_message),
+    ctypes.c_size_t
+]
+_lib.llama_chat_apply_template.restype = ctypes.c_int32
+
+
+
 # //
 # // Grammar
 # //
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -290,6 +290,7 @@ async def create_completion(
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
+            sep='\n',
        )
    else:
        return iterator_or_completion
@ -382,6 +383,7 @@ async def create_chat_completion(
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
+            sep='\n',
        )
    else:
        return iterator_or_completion
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@ -2,7 +2,7 @@ from __future__ import annotations

 import multiprocessing

-from typing import Optional, List, Literal
+from typing import Optional, List, Literal, Union
 from pydantic import Field
 from pydantic_settings import BaseSettings

@ -108,7 +108,7 @@ class ModelSettings(BaseSettings):
        description="Path to a LoRA file to apply to the model.",
    )
    # Backend Params
-    numa: bool = Field(
+    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f
+Subproject commit f53119cec4f073b6d214195ecbe1fad3abdf2b34