@@ -754,12 +754,27 @@ namespace onmt
754
754
}
755
755
756
756
static inline size_t get_next_main_char (const std::vector<unicode::CharInfo>& chars,
757
- size_t offset)
757
+ const std::vector<int >& scripts,
758
+ size_t offset,
759
+ const Tokenizer::Options& options)
758
760
{
759
- ++offset;
760
- while (offset < chars.size () && chars[offset].char_type == unicode::CharType::Mark)
761
- ++offset;
762
- return offset;
761
+ size_t next_offset = offset + 1 ;
762
+
763
+ while (next_offset < chars.size ()) {
764
+ if (chars[next_offset].char_type != unicode::CharType::Mark)
765
+ break ;
766
+
767
+ if (options.allow_isolated_marks ) {
768
+ if (chars[offset].char_type == unicode::CharType::Separator)
769
+ break ;
770
+ if (options.segment_alphabet_change && scripts[next_offset] != scripts[offset])
771
+ break ;
772
+ }
773
+
774
+ ++next_offset;
775
+ }
776
+
777
+ return next_offset;
763
778
}
764
779
765
780
void Tokenizer::tokenize_text (const std::string& text,
@@ -771,6 +786,13 @@ namespace onmt
771
786
772
787
const auto chars = unicode::get_characters_info (text);
773
788
789
+ std::vector<int > scripts;
790
+ scripts.reserve (chars.size ());
791
+ for (const auto & c : chars) {
792
+ const int previous_script = scripts.empty () ? -1 : scripts.back ();
793
+ scripts.emplace_back (unicode::get_script (c.value , previous_script));
794
+ }
795
+
774
796
TokensBuilder builder (_options, annotated_tokens);
775
797
State state = State::Space;
776
798
int prev_alphabet = -1 ;
@@ -782,7 +804,7 @@ namespace onmt
782
804
if (v < 32 || v == 0xFEFF ) // skip special characters and BOM
783
805
continue ;
784
806
785
- const size_t next_index = get_next_main_char (chars, i );
807
+ const size_t next_index = get_next_main_char (chars, scripts, i, _options );
786
808
const auto * next_c = next_index < chars.size () ? &chars[next_index] : nullptr ;
787
809
const bool has_combining_marks = (next_index != i + 1 );
788
810
@@ -872,7 +894,7 @@ namespace onmt
872
894
if (is_number)
873
895
alphabet = number_alphabet;
874
896
else if (is_letter)
875
- alphabet = unicode::get_script (v, prev_alphabet) ;
897
+ alphabet = scripts[i] ;
876
898
877
899
if (alphabets != nullptr )
878
900
{
@@ -976,6 +998,8 @@ namespace onmt
976
998
{
977
999
builder.segment ();
978
1000
builder.current ().join_left = true ;
1001
+ if (_options.preserve_segmented_tokens && c.char_type == unicode::CharType::Mark)
1002
+ builder.current ().preserve = true ;
979
1003
}
980
1004
981
1005
builder.safe_append (c);
0 commit comments