@@ -18,9 +18,10 @@ namespace onmt
18
18
// Definitions of the Tokenizer marker strings, followed by file-local
// code-point constants for the placeholder delimiters.
// NOTE(review): the leading space shown inside each string literal may be an
// artifact of how this diff was rendered -- confirm against the real file.
const std::string Tokenizer::spacer_marker = " ▁" ;
19
19
// NOTE(review): as rendered here the placeholder literals are U+2985 / U+2986,
// whereas ph_marker_open_cp / ph_marker_close_cp below are 0xFF5F / 0xFF60.
// This looks like a normalization artifact of the rendering; verify that the
// literals and the code points agree in the actual source.
const std::string Tokenizer::ph_marker_open = " ⦅" ;
20
20
const std::string Tokenizer::ph_marker_close = " ⦆" ;
21
// Prefix that introduces an escaped character. The escaping code appends this
// prefix followed by int_to_hex(character.value, escaped_character_width),
// and unescape_characters() searches for the same prefix.
+ const std::string Tokenizer::escaped_character_prefix = " %" ;
22
// Number of characters read after the prefix when unescaping, and the width
// passed to int_to_hex when escaping.
+ const size_t Tokenizer::escaped_character_width = 4 ;
21
23
static const unicode::code_point_t ph_marker_open_cp = 0xFF5F ;
22
24
static const unicode::code_point_t ph_marker_close_cp = 0xFF60 ;
23
// Removed by this change: replaced by Tokenizer::escaped_character_prefix.
- static const std::string protected_character = " %" ;
24
25
static const std::vector<std::pair<unicode::code_point_t , std::string>> substitutes = {
25
26
{0x2581 /* ▁ */ , " _" },
26
27
{0xFFED /* ■ */ , " ■" },
@@ -32,7 +33,6 @@ namespace onmt
32
33
33
34
// Negative sentinel values; presumably used as alphabet identifiers for
// placeholder and number tokens -- their use is not visible in this hunk.
static const int placeholder_alphabet = -2 ;
34
35
static const int number_alphabet = -3 ;
35
// Removed by this change: replaced by Tokenizer::escaped_character_width,
// which is defined with the same value (4).
- static const int hex_value_width = 4 ;
36
36
37
37
Tokenizer::Mode Tokenizer::str_to_mode (const std::string& mode)
38
38
{
@@ -290,21 +290,23 @@ namespace onmt
290
290
291
291
// Rewrites `str` in place: each occurrence of the escape prefix followed by
// `width` characters is replaced by the UTF-8 string that
// unicode::cp_to_utf8 returns for the value parsed from those characters by
// hex_to_int. Sequences that fail the validity check below are left as-is.
static inline void unescape_characters (std::string& str)
292
292
{
293
+ const auto & prefix = Tokenizer::escaped_character_prefix;
294
+ const auto & width = Tokenizer::escaped_character_width;
295
+
293
296
// `offset` is where the next search for the prefix starts; the loop has no
// condition and exits only through the break below.
for (size_t offset = 0 ;;)
294
297
{
295
- const size_t index = str.find (protected_character, offset);
296
- if (index == std::string::npos
297
- || index + protected_character.size () + hex_value_width > str.size ())
298
+ const size_t index = str.find (prefix, offset);
299
// Stop when there is no further prefix, or when fewer than `width`
// characters remain after it (any later match would be shorter still).
+ if (index == std::string::npos || index + prefix.size () + width > str.size ())
298
300
break ;
299
301
300
- const std::string code = str.substr (index + protected_character .size (), hex_value_width );
302
+ const std::string code = str.substr (index + prefix .size (), width );
301
303
const int v = hex_to_int (code);
302
304
const std::string c = unicode::cp_to_utf8 (v);
303
- if (c.empty () || !c[0 ] || int_to_hex (v, hex_value_width ) != code)
304
- offset = index + protected_character .size ();
305
// Reject the candidate when the conversion yields nothing, yields a string
// starting with a NUL byte, or when re-encoding `v` does not reproduce
// `code` exactly (presumably filters text that was not produced by the
// escaping code -- confirm against hex_to_int / int_to_hex).
+ if (c.empty () || !c[0 ] || int_to_hex (v, width ) != code)
306
// Not a valid escape: leave the text untouched and continue after the prefix.
+ offset = index + prefix .size ();
305
307
else
306
308
{
307
- str.replace (index , protected_character .size () + hex_value_width , c);
309
+ str.replace (index , prefix .size () + width , c);
308
310
// Resume one position past `index`, so the first byte of the inserted text
// is not searched again.
offset = index + 1 ;
309
311
}
310
312
}
@@ -635,7 +637,8 @@ namespace onmt
635
637
if (_no_substitution)
636
638
append (character);
637
639
else
638
- append (protected_character + int_to_hex (character.value , hex_value_width));
640
+ append (Tokenizer::escaped_character_prefix
641
+ + int_to_hex (character.value , Tokenizer::escaped_character_width));
639
642
}
640
643
641
644
void flush_feature ()
0 commit comments