From 099f7077a146917a78e136b4bbcf19ab134961d5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 26 Oct 2024 14:58:54 -0700 Subject: [PATCH] Fix deepseek deseret regex (#7369) On windows compiled with gcc the c++ regex library failed to handle the characters --- llama/llama-vocab.cpp | 2 +- .../0012-fix-deepseek-deseret-regex.patch | 66 +++++++++++++++++++ llama/unicode.cpp | 21 ++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 llama/patches/0012-fix-deepseek-deseret-regex.patch diff --git a/llama/llama-vocab.cpp b/llama/llama-vocab.cpp index 6932de44..eff70883 100644 --- a/llama/llama-vocab.cpp +++ b/llama/llama-vocab.cpp @@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: regex_exprs = { "[\r\n]", - "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", "\\s?[!-/:-~!-/:-~‘-‟ -。]+", "\\s+$", "[一-龥ࠀ-一가-퟿]+", diff --git a/llama/patches/0012-fix-deepseek-deseret-regex.patch b/llama/patches/0012-fix-deepseek-deseret-regex.patch new file mode 100644 index 00000000..c42cf355 --- /dev/null +++ b/llama/patches/0012-fix-deepseek-deseret-regex.patch @@ -0,0 +1,66 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Fri, 25 Oct 2024 16:25:18 -0700 +Subject: [PATCH] fix deepseek deseret regex + +On windows compiled with gcc the c++ regex library failed to handle +the characters +--- + src/llama-vocab.cpp | 2 +- + src/unicode.cpp | 21 +++++++++++++++++++++ + 2 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp +index d2f34ddd..3ef6af19 100644 +--- a/src/llama-vocab.cpp ++++ b/src/llama-vocab.cpp +@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + regex_exprs = { + "[\r\n]", +- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", ++ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", +diff --git a/src/unicode.cpp b/src/unicode.cpp +index f4e941cd..9d78ff16 100644 +--- a/src/unicode.cpp ++++ b/src/unicode.cpp +@@ -2,6 +2,11 @@ + #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING + #endif + ++#if defined(_WIN32) ++#define WIN32_LEAN_AND_MEAN ++#include ++#endif ++ + #include "unicode.h" + #include "unicode-data.h" + +@@ -201,8 +206,24 @@ static std::unordered_map unicode_utf8_to_byte_map() { + } + + static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { ++#ifdef _WIN32 ++ int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0); ++ if (!wlen) { ++ throw std::invalid_argument("failed to convert regex"); ++ } ++ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); ++ wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen); ++ if (!wlen) { ++ free(wbuf); ++ throw std::invalid_argument("failed to convert regex"); ++ } ++ std::wstring ret = std::wstring(wbuf); ++ free(wbuf); ++ return ret; ++#else + std::wstring_convert> conv; + return conv.from_bytes(s); ++#endif + } + + static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { diff --git a/llama/unicode.cpp b/llama/unicode.cpp index d7af3686..4a8946d2 100644 --- a/llama/unicode.cpp +++ b/llama/unicode.cpp @@ -28,6 +28,11 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +#endif + #include "unicode.h" #include "unicode-data.h" @@ -227,8 +232,24 @@ static std::unordered_map unicode_utf8_to_byte_map() { } static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { +#ifdef _WIN32 + int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0); + if (!wlen) { + throw std::invalid_argument("failed to convert regex"); + } + wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); + wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen); + if (!wlen) { + free(wbuf); + throw std::invalid_argument("failed to convert regex"); + } + std::wstring ret = std::wstring(wbuf); + free(wbuf); + return ret; +#else std::wstring_convert> conv; return conv.from_bytes(s); +#endif } static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) {