From eb9c7d4ed8984bdff6585e38d04e7d17bf14155e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 Jan 2024 22:04:04 -0500 Subject: [PATCH 1/6] Update llama.cpp --- llama_cpp/llama_cpp.py | 138 ++++++++++++++++++++++------------------- vendor/llama.cpp | 2 +- 2 files changed, 74 insertions(+), 66 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 75800c0..4aada53 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -93,6 +93,9 @@ c_size_t_p = POINTER(c_size_t) # llama.h bindings +_lib.llama_max_devices.argtypes = [] +_lib.llama_max_devices.restype = ctypes.c_int32 + LLAMA_MAX_DEVICES = _lib.llama_max_devices() # define LLAMA_DEFAULT_SEED 0xFFFFFFFF @@ -481,7 +484,7 @@ It might not exist for progress report where '.' is output repeatedly.""" # // model quantization parameters # typedef struct llama_model_quantize_params { -# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() # enum llama_ftype ftype; // quantize to this llama_ftype # bool allow_requantize; // allow quantizing non-f32/f16 tensors # bool quantize_output_tensor; // quantize output.weight @@ -499,7 +502,7 @@ class llama_model_quantize_params(Structure): only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): disable k-quant mixtures and quantize all tensors to the same type""" _fields_ = [ - ("nthread", c_int), + ("nthread", c_int32), ("ftype", c_int), ("allow_requantize", c_bool), ("quantize_output_tensor", c_bool), @@ -698,13 +701,13 @@ _lib.llama_time_us.argtypes = [] _lib.llama_time_us.restype = ctypes.c_int64 -# LLAMA_API int llama_max_devices (void); +# LLAMA_API int32_t llama_max_devices(void); def llama_max_devices() -> int: return _lib.llama_max_devices() _lib.llama_max_devices.argtypes = [] -_lib.llama_max_devices.restype = c_int +_lib.llama_max_devices.restype = ctypes.c_int32 # LLAMA_API bool llama_mmap_supported (void); @@ -734,7 +737,7 @@ _lib.llama_get_model.argtypes = [llama_context_p] _lib.llama_get_model.restype = llama_model_p -# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); +# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -758,31 +761,31 @@ _lib.llama_vocab_type.argtypes = [llama_model_p] _lib.llama_vocab_type.restype = c_int -# LLAMA_API int llama_n_vocab (const struct llama_model * model); +# LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); def llama_n_vocab(model: llama_model_p) -> int: return _lib.llama_n_vocab(model) _lib.llama_n_vocab.argtypes = [llama_model_p] -_lib.llama_n_vocab.restype = c_int +_lib.llama_n_vocab.restype = c_int32 -# LLAMA_API int llama_n_ctx_train(const struct llama_model * model); +# LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); def llama_n_ctx_train(model: llama_model_p) -> int: return _lib.llama_n_ctx_train(model) _lib.llama_n_ctx_train.argtypes = [llama_model_p] -_lib.llama_n_ctx_train.restype = c_int +_lib.llama_n_ctx_train.restype = c_int32 -# LLAMA_API int llama_n_embd (const struct llama_model * model); +# LLAMA_API int32_t llama_n_embd (const struct llama_model * model); def llama_n_embd(model: llama_model_p) -> int: return _lib.llama_n_embd(model) _lib.llama_n_embd.argtypes = [llama_model_p] -_lib.llama_n_embd.restype = c_int 
+_lib.llama_n_embd.restype = c_int32 # // Get the model's RoPE frequency scaling factor @@ -802,7 +805,7 @@ _lib.llama_rope_freq_scale_train.restype = c_float # // Get metadata value as a string by key name -# LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); +# LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); def llama_model_meta_val_str( model: llama_model_p, key: Union[c_char_p, bytes], buf: bytes, buf_size: int ) -> int: @@ -811,22 +814,22 @@ def llama_model_meta_val_str( _lib.llama_model_meta_val_str.argtypes = [llama_model_p, c_char_p, c_char_p, c_size_t] -_lib.llama_model_meta_val_str.restype = c_int +_lib.llama_model_meta_val_str.restype = c_int32 # // Get the number of metadata key/value pairs -# LLAMA_API int llama_model_meta_count(const struct llama_model * model); +# LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model); def llama_model_meta_count(model: llama_model_p) -> int: """Get the number of metadata key/value pairs""" return _lib.llama_model_meta_count(model) _lib.llama_model_meta_count.argtypes = [llama_model_p] -_lib.llama_model_meta_count.restype = c_int +_lib.llama_model_meta_count.restype = c_int32 # // Get metadata key name by index -# LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); +# LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); def llama_model_meta_key_by_index( model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int ) -> int: @@ -834,12 +837,17 @@ def llama_model_meta_key_by_index( return _lib.llama_model_meta_key_by_index(model, i, buf, buf_size) -_lib.llama_model_meta_key_by_index.argtypes = [llama_model_p, c_int, c_char_p, c_size_t] -_lib.llama_model_meta_key_by_index.restype = c_int +_lib.llama_model_meta_key_by_index.argtypes = [ + llama_model_p, + c_int32, + c_char_p, + c_size_t, +] +_lib.llama_model_meta_key_by_index.restype = c_int32 # // Get metadata value as a string by index -# LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); +# LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); def llama_model_meta_val_str_by_index( model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int ) -> int: @@ -849,15 +857,15 @@ def llama_model_meta_val_str_by_index( _lib.llama_model_meta_val_str_by_index.argtypes = [ llama_model_p, - c_int, + c_int32, c_char_p, c_size_t, ] -_lib.llama_model_meta_val_str_by_index.restype = c_int +_lib.llama_model_meta_val_str_by_index.restype = c_int32 # // Get a string describing the model type -# LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); +# LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); def llama_model_desc( model: llama_model_p, buf: bytes, buf_size: Union[c_size_t, int] ) -> int: @@ -866,7 +874,7 @@ def llama_model_desc( _lib.llama_model_desc.argtypes = [llama_model_p, c_char_p, c_size_t] -_lib.llama_model_desc.restype = c_int +_lib.llama_model_desc.restype = c_int32 # // Returns the total size of all the tensors in the model in bytes @@ -905,7 +913,7 @@ _lib.llama_get_model_tensor.restype = c_void_p # // Returns 0 on success -# LLAMA_API int 
llama_model_quantize( +# LLAMA_API uint32_t llama_model_quantize( # const char * fname_inp, # const char * fname_out, # const llama_model_quantize_params * params); @@ -923,7 +931,7 @@ _lib.llama_model_quantize.argtypes = [ c_char_p, POINTER(llama_model_quantize_params), ] -_lib.llama_model_quantize.restype = c_int +_lib.llama_model_quantize.restype = c_uint32 # // Apply a LoRA adapter to a loaded model @@ -932,12 +940,12 @@ _lib.llama_model_quantize.restype = c_int # // The model needs to be reloaded before applying a new adapter, otherwise the adapter # // will be applied on top of the previous one # // Returns 0 on success -# LLAMA_API DEPRECATED(int llama_apply_lora_from_file( +# LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file( # struct llama_context * ctx, # const char * path_lora, # float scale, # const char * path_base_model, -# int n_threads), +# int32_t n_threads), # "use llama_model_apply_lora_from_file instead"); def llama_apply_lora_from_file( ctx: llama_context_p, @@ -962,17 +970,17 @@ _lib.llama_apply_lora_from_file.argtypes = [ c_char_p, c_float, c_char_p, - c_int, + c_int32, ] -_lib.llama_apply_lora_from_file.restype = c_int +_lib.llama_apply_lora_from_file.restype = c_int32 -# LLAMA_API int llama_model_apply_lora_from_file( +# LLAMA_API int32_t llama_model_apply_lora_from_file( # const struct llama_model * model, # const char * path_lora, # float scale, # const char * path_base_model, -# int n_threads); +# int32_t n_threads); def llama_model_apply_lora_from_file( model: llama_model_p, path_lora: Union[c_char_p, bytes], @@ -990,9 +998,9 @@ _lib.llama_model_apply_lora_from_file.argtypes = [ c_char_p, c_float, c_char_p, - c_int, + c_int32, ] -_lib.llama_model_apply_lora_from_file.restype = c_int +_lib.llama_model_apply_lora_from_file.restype = c_int32 # // # // KV cache @@ -1094,7 +1102,7 @@ _lib.llama_kv_cache_view_update.restype = None # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); +# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) If a KV cell has multiple sequences assigned to it, it will be counted multiple times @@ -1103,18 +1111,18 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_token_count.restype = c_int +_lib.llama_get_kv_cache_token_count.restype = c_int32 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx); +# LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int: """Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them)""" return _lib.llama_get_kv_cache_used_cells(ctx) _lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_used_cells.restype = c_int +_lib.llama_get_kv_cache_used_cells.restype = c_int32 # // Clear the KV cache @@ -1361,7 +1369,7 @@ _lib.llama_save_session_file.restype = c_size_t # struct llama_context * ctx, # llama_token * tokens, # int32_t n_tokens, -# int n_past), +# int32_t n_past), # "use llama_decode() instead"); def llama_eval( ctx: llama_context_p, @@ -1377,7 +1385,7 @@ def llama_eval( return _lib.llama_eval(ctx, tokens, n_tokens, n_past) -_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int] +_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int32, c_int32] _lib.llama_eval.restype = c_int @@ -1387,7 +1395,7 @@ _lib.llama_eval.restype = c_int # struct llama_context * ctx, # float * embd, # int32_t n_tokens, -# int n_past), +# int32_t n_past), # "use llama_decode() instead"); def llama_eval_embd( ctx: llama_context_p, @@ -1400,7 +1408,7 @@ def llama_eval_embd( return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past) -_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int] +_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int32, c_int32] _lib.llama_eval_embd.restype = c_int @@ -1480,7 +1488,7 @@ _lib.llama_batch_free.restype = None # // 0 - success # // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) # // < 0 - error -# LLAMA_API int llama_decode( +# LLAMA_API int32_t llama_decode( # struct llama_context * ctx, # struct llama_batch batch); def llama_decode(ctx: llama_context_p, batch: llama_batch) -> int: @@ -1492,7 +1500,7 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch) -> int: _lib.llama_decode.argtypes = [llama_context_p, llama_batch] -_lib.llama_decode.restype = c_int +_lib.llama_decode.restype = c_int32 # // Set the number of threads used for decoding @@ -1634,25 +1642,25 @@ _lib.llama_token_nl.restype = llama_token # // Returns -1 if unknown, 1 for true or 0 for false. -# LLAMA_API int llama_add_bos_token(const struct llama_model * model); +# LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); def llama_add_bos_token(model: llama_model_p) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" return _lib.llama_add_bos_token(model) _lib.llama_add_bos_token.argtypes = [llama_model_p] -_lib.llama_add_bos_token.restype = c_int +_lib.llama_add_bos_token.restype = c_int32 # // Returns -1 if unknown, 1 for true or 0 for false. -# LLAMA_API int llama_add_eos_token(const struct llama_model * model); +# LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); def llama_add_eos_token(model: llama_model_p) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" return _lib.llama_add_eos_token(model) _lib.llama_add_eos_token.argtypes = [llama_model_p] -_lib.llama_add_eos_token.restype = c_int +_lib.llama_add_eos_token.restype = c_int32 # // codellama infill tokens @@ -1704,12 +1712,12 @@ _lib.llama_token_eot.restype = llama_token # /// @return Returns a negative number on failure - the number of tokens that would have been returned # /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. # /// Does not insert a leading space. 
-# LLAMA_API int llama_tokenize( +# LLAMA_API int32_t llama_tokenize( # const struct llama_model * model, # const char * text, -# int text_len, +# int32_t text_len, # llama_token * tokens, -# int n_max_tokens, +# int32_t n_max_tokens, # bool add_bos, # bool special); def llama_tokenize( @@ -1730,24 +1738,24 @@ def llama_tokenize( _lib.llama_tokenize.argtypes = [ llama_model_p, c_char_p, - c_int, + c_int32, llama_token_p, - c_int, + c_int32, c_bool, c_bool, ] -_lib.llama_tokenize.restype = c_int +_lib.llama_tokenize.restype = c_int32 # // Token Id -> Piece. # // Uses the vocabulary in the provided context. # // Does not write null terminator to the buffer. # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. -# LLAMA_API int llama_token_to_piece( +# LLAMA_API int32_t llama_token_to_piece( # const struct llama_model * model, # llama_token token, # char * buf, -# int length); +# int32_t length); def llama_token_to_piece( model: llama_model_p, token: Union[llama_token, int], @@ -1762,8 +1770,8 @@ def llama_token_to_piece( return _lib.llama_token_to_piece(model, token, buf, length) -_lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_int] -_lib.llama_token_to_piece.restype = c_int +_lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_int32] +_lib.llama_token_to_piece.restype = c_int32 # // @@ -1924,7 +1932,7 @@ _lib.llama_sample_softmax.restype = None # LLAMA_API void llama_sample_top_k( # struct llama_context * ctx, # llama_token_data_array * candidates, -# int k, +# int32_t k, # size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, @@ -1939,7 +1947,7 @@ def llama_sample_top_k( _lib.llama_sample_top_k.argtypes = [ llama_context_p, llama_token_data_array_p, - c_int, + c_int32, c_size_t, ] _lib.llama_sample_top_k.restype = None @@ -2129,7 +2137,7 @@ _lib.llama_sample_grammar.restype = None # llama_token_data_array * candidates, # float tau, # float eta, -# int m, +# int32_t m, # float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, @@ -2155,7 +2163,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ llama_token_data_array_p, c_float, c_float, - c_int, + c_int32, c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -2320,8 +2328,8 @@ llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_s # llama_beam_search_callback_fn_t callback, # void * callback_data, # size_t n_beams, -# int n_past, -# int n_predict); +# int32_t n_past, +# int32_t n_predict); def llama_beam_search( ctx: llama_context_p, callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore @@ -2340,8 +2348,8 @@ _lib.llama_beam_search.argtypes = [ llama_beam_search_callback_fn_t, c_void_p, c_size_t, - c_int, - c_int, + c_int32, + c_int32, ] _lib.llama_beam_search.restype = None diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f679349..cb1e281 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f6793491b5af6da75edad34d6f503ef86d31b09f +Subproject commit cb1e2818e0e12ec99f7236ec5d4f3ffd8bcc2f4a From cf743ec5d32cc84e68295da8442ccf3a64e635f1 Mon Sep 17 00:00:00 2001 From: xaviviro Date: Fri, 5 Jan 2024 00:12:02 +0100 Subject: [PATCH 2/6] Added ChatGLM chat format (#1059) Co-authored-by: Xavier Vinaixa Rosello --- llama_cpp/llama_chat_format.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 
037f96a..6f402e0 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -172,6 +172,20 @@ def _format_chatml( ret += role + "\n" return ret +def _format_chatglm3( + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str +) -> str: + """Format the prompt with the chatglm3 style.""" + ret = "" + if system_message: + ret += system_message + for role, message in messages: + if message: + ret += role + "\n" + " " + message + else: + ret += role + return ret + @dataclasses.dataclass class ChatFormatterResponse: @@ -685,6 +699,22 @@ def format_chatml( _prompt = _format_chatml(system_message, _messages, _sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep) +@register_chat_format("chatglm3") +def format_chatglm3( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + system_template = """<|system|> +{system_message}""" + system_message = _get_system_message(messages) + system_message = system_template.format(system_message=system_message) + _roles = dict(user="<|user|>", assistant="<|assistant|>") + _sep = "" + _messages = _map_roles(messages, _roles) + _messages.append((_roles["assistant"], None)) + _prompt = _format_chatglm3(system_message, _messages, _sep) + return ChatFormatterResponse(prompt=_prompt, stop=_sep) + @register_chat_format("openchat") def format_openchat( From f766b70c9a63801f6f27dc92b4ab822f92055bc9 Mon Sep 17 00:00:00 2001 From: Caleb Hoff Date: Thu, 4 Jan 2024 17:12:32 -0600 Subject: [PATCH 3/6] Fix: Correct typo in README.md (#1058) In Llama.create_chat_completion, the `tool_choice` property does not have an s on the end. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97fe6e1..b2e879e 100644 --- a/README.md +++ b/README.md @@ -238,7 +238,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h } } }], - tool_choices=[{ + tool_choice=[{ "type": "function", "function": { "name": "UserDetail" From 907b9e9d4281336072519fbf11e885768ad0ff0b Mon Sep 17 00:00:00 2001 From: Fedor Moiseev Date: Fri, 5 Jan 2024 06:12:58 +0700 Subject: [PATCH 4/6] Add Saiga chat format. 
(#1050) --- llama_cpp/llama_chat_format.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 6f402e0..0ef7bd4 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -734,6 +734,28 @@ def format_openchat( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +# Chat format for Saiga models, see more details and available models: +# https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd +@register_chat_format("saiga") +def format_saiga( + messages: list[llama_types.ChatCompletionRequestMessage], + **kwargs, +) -> ChatFormatterResponse: + _message_template = "{role}\n{content}" + _roles = dict(user="user", bot="bot", system="system") + _messages = _map_roles(messages, _roles) + + _prompt = "" + for role, content in _messages: + if content: + _prompt += _message_template.format(role=role, content=content) + else: + _prompt += f"{role}\n" + # Response template + _prompt += "bot" + return ChatFormatterResponse(prompt=_prompt.strip()) + + @register_chat_completion_handler("functionary") def functionary_chat_handler( llama: llama.Llama, From fffcd0181c2b58a084daebc6df659520d0c73337 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 Jan 2024 18:26:00 -0500 Subject: [PATCH 5/6] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index cb1e281..b3a7c20 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit cb1e2818e0e12ec99f7236ec5d4f3ffd8bcc2f4a +Subproject commit b3a7c20b5c035250257d2b62851c379b159c899a From 75d0527fd782a792af8612e55b0a3f2dad469ae9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 Jan 2024 18:30:12 -0500 Subject: [PATCH 6/6] Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 228e3b9..bcf1665 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.27] + +- feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a +- feat: Add `saiga` chat format by @femoiseev in #1050 +- feat: Added `chatglm3` chat format by @xaviviro in #1059 +- fix: Correct typo in README.md by @qeleb in (#1058) + ## [0.2.26] - feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 5db4664..d3fe66b 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.26" \ No newline at end of file +__version__ = "0.2.27" \ No newline at end of file
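
Patches 2 and 4 register two new chat formats, `chatglm3` and `saiga`, via `@register_chat_format`. A minimal usage sketch, assuming the high-level `Llama(chat_format=...)` / `create_chat_completion()` API and a locally downloaded GGUF model (the model path below is a placeholder, not part of these patches):

from llama_cpp import Llama

# Placeholder path -- substitute any locally available ChatGLM3 (or Saiga) GGUF file.
llm = Llama(
    model_path="./chatglm3-6b.Q4_K_M.gguf",
    chat_format="chatglm3",  # or "saiga" for Saiga models, per patch 4
    n_ctx=2048,
)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize what a KV cache does."},
    ],
    max_tokens=128,
)
print(response["choices"][0]["message"]["content"])

The format name passed as `chat_format` must match the string given to `@register_chat_format` in `llama_cpp/llama_chat_format.py`; the formatter then builds the model-specific prompt (e.g. the `<|system|>`/`<|user|>`/`<|assistant|>` layout for ChatGLM3) before decoding.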