From 4887973c2257437166d73cd2f34eb1fafcfca2e9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 27 Aug 2023 12:59:20 -0400
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama.py     |  6 +++---
 llama_cpp/llama_cpp.py | 47 +++++++++++++++++++++--------------------------
 tests/test_llama.py    |  6 +++---
 vendor/llama.cpp       |  2 +-
 4 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 49c98fd..22625d8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -445,17 +445,17 @@ class Llama:
         """
         assert self.model is not None
         output = b""
-        size = 8
+        size = 32
         buffer = (ctypes.c_char * size)()
         for token in tokens:
-            n = llama_cpp.llama_token_to_str_with_model(
+            n = llama_cpp.llama_token_to_piece_with_model(
                 self.model, llama_cpp.llama_token(token), buffer, size
             )
             assert n <= size
             output += bytes(buffer[:n])
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
-        return output
+        return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output

     def set_cache(self, cache: Optional[BaseLlamaCache]):
         """Set the cache.
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 1731878..8cb442d 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -973,48 +973,43 @@ _lib.llama_tokenize_with_model.argtypes = [
 _lib.llama_tokenize_with_model.restype = c_int


-# // Token Id -> String. Uses the vocabulary in the provided context
-# // Does not write null terminator to the buffer
-# LLAMA_API int llama_token_to_str(
+# // Token Id -> Piece.
+# // Uses the vocabulary in the provided context.
+# // Does not write null terminator to the buffer.
+# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+# LLAMA_API int llama_token_to_piece( # const struct llama_context * ctx, -# llama_token token, -# char * buf, -# int length); -def llama_token_to_str( +# llama_token token, +# char * buf, +# int length); +def llama_token_to_piece( ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int ) -> int: - return _lib.llama_token_to_str(ctx, token, buf, length) + return _lib.llama_token_to_piece(ctx, token, buf, length) -_lib.llama_tokenize_with_model.argtypes = [ - llama_model_p, - c_char_p, - llama_token_p, - c_int, - c_bool, -] -_lib.llama_tokenize_with_model.restype = c_int +_lib.llama_token_to_piece.argtypes = [llama_context_p, llama_token, c_char_p, c_int] +_lib.llama_token_to_piece.restype = c_int -# LLAMA_API int llama_token_to_str_with_model( -# const struct llama_model * model, -# llama_token token, -# char * buf, -# int length); -def llama_token_to_str_with_model( +# LLAMA_API int llama_token_to_piece_with_model( +# const struct llama_model * model, +# llama_token token, +# char * buf, +# int length); +def llama_token_to_piece_with_model( model: llama_model_p, token: llama_token, buf: bytes, length: c_int ) -> int: - return _lib.llama_token_to_str_with_model(model, token, buf, length) + return _lib.llama_token_to_piece_with_model(model, token, buf, length) -_lib.llama_token_to_str_with_model.argtypes = [ +_lib.llama_token_to_piece_with_model.argtypes = [ llama_model_p, llama_token, c_char_p, c_int, ] -_lib.llama_token_to_str_with_model.restype = c_int - +_lib.llama_token_to_piece_with_model.restype = c_int # // # // Grammar diff --git a/tests/test_llama.py b/tests/test_llama.py index e038a89..c240122 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -14,16 +14,16 @@ def test_llama_cpp_tokenization(): tokens = llama.tokenize(text) assert tokens[0] == llama.token_bos() - assert tokens == [1, 10994, 2787] + assert tokens == [1, 15043, 2787] detokenized = llama.detokenize(tokens) assert detokenized == text tokens = llama.tokenize(text, add_bos=False) assert tokens[0] != llama.token_bos() - assert tokens == [10994, 2787] + assert tokens == [15043, 2787] detokenized = llama.detokenize(tokens) - assert detokenized == text + assert detokenized != text @pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos") diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c1ac54b..c10704d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c1ac54b77aaba10d029084d152be786102010eb2 +Subproject commit c10704d01e21e3dbe4d6ca1026ebff85349dd239
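
A minimal sketch, not part of the patch, of what the updated detokenize path does end to end. It assumes only names this patch introduces (llama_cpp.llama_token_to_piece_with_model, llama_cpp.llama_token); the standalone detokenize_pieces helper and its model/bos_token parameters are illustrative placeholders, not llama-cpp-python API.

import ctypes
import llama_cpp

def detokenize_pieces(model, tokens, bos_token):
    # Mirrors Llama.detokenize after this patch (illustrative only).
    output = b""
    size = 32  # per-token scratch buffer, grown from 8 to 32 bytes by this patch
    buffer = (ctypes.c_char * size)()
    for token in tokens:
        # Renamed binding: llama_token_to_str_with_model -> llama_token_to_piece_with_model
        n = llama_cpp.llama_token_to_piece_with_model(
            model, llama_cpp.llama_token(token), buffer, size
        )
        assert n <= size  # each piece must fit in the scratch buffer
        output += bytes(buffer[:n])
    # Strip the leading space the tokenizer attaches after a BOS token,
    # per the upstream comment quoted in the llama_cpp.py hunk above.
    return output[1:] if len(tokens) > 0 and tokens[0] == bos_token else output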
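The flipped test expectations follow from the same rule: the bumped vendor/llama.cpp tokenizer produces the new expected ids (15043, 2787) and encodes the leading space into the first piece, while detokenize now strips that space only when the first token is BOS. With a leading BOS the round trip still reproduces the input exactly; with add_bos=False the leading space survives in the output, so the final assertion becomes detokenized != text.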