Fix deepseek deseret regex (#7369)

On windows compiled with gcc the c++ regex library failed to handle
the characters
This commit is contained in:
Daniel Hiltgen 2024-10-26 14:58:54 -07:00 committed by GitHub
parent d7c94e0ca6
commit 099f7077a1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 88 additions and 1 deletions

View file

@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
"\\s?[!-/:-~---‟ -。]+",
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",

View file

@ -0,0 +1,66 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Fri, 25 Oct 2024 16:25:18 -0700
Subject: [PATCH] fix deepseek deseret regex
On windows compiled with gcc the c++ regex library failed to handle
the characters
---
src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd..3ef6af19 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
"\\s?[!-/:-~---‟ -。]+",
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index f4e941cd..9d78ff16 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
#include "unicode.h"
#include "unicode-data.h"
@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#ifdef _WIN32
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
+ if (!wlen) {
+ throw std::invalid_argument("failed to convert regex");
+ }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
+ if (!wlen) {
+ free(wbuf);
+ throw std::invalid_argument("failed to convert regex");
+ }
+ std::wstring ret = std::wstring(wbuf);
+ free(wbuf);
+ return ret;
+#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
return conv.from_bytes(s);
+#endif
}
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {

21
llama/unicode.cpp vendored
View file

@ -28,6 +28,11 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "unicode.h"
#include "unicode-data.h"
@ -227,8 +232,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#ifdef _WIN32
int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
if (!wlen) {
throw std::invalid_argument("failed to convert regex");
}
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
if (!wlen) {
free(wbuf);
throw std::invalid_argument("failed to convert regex");
}
std::wstring ret = std::wstring(wbuf);
free(wbuf);
return ret;
#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
return conv.from_bytes(s);
#endif
}
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {