diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff new file mode 100644 index 00000000..56a583e0 --- /dev/null +++ b/llm/patches/10-tekken.diff @@ -0,0 +1,43 @@ +diff --git a/include/llama.h b/include/llama.h +index bb4b05ba..a92174e0 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -92,6 +92,7 @@ extern "C" { + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, + LLAMA_VOCAB_PRE_TYPE_VIKING = 18, + LLAMA_VOCAB_PRE_TYPE_JAIS = 19, ++ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, + }; + + // note: these values should be synchronized with ggml_rope +diff --git a/src/llama.cpp b/src/llama.cpp +index 18364976..435b6fe5 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -5429,6 +5429,12 @@ static void llm_load_vocab( + } else if ( + tokenizer_pre == "jais") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; ++ } else if ( ++ tokenizer_pre == "tekken") { ++ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; ++ vocab.tokenizer_clean_spaces = false; ++ vocab.tokenizer_ignore_merges = true; ++ vocab.tokenizer_add_bos = true; + } else { + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; +@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { + " ?[^(\\s|.,!?…。,、।۔،)]+", + }; + break; ++ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: ++ // original regex from tokenizer.json ++ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ++ regex_exprs = { ++ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", ++ }; ++ break; + default: + // default regex for BPE tokenization pre-processing + regex_exprs = {