Add tokenization option allow_isolated_marks (#316)

guillaumekln · web-flow · commit d5c1908d3eb7 · 2023-02-28T13:46:23.000+01:00
* Add tokenization option allow_isolated_marks

* Fix missing handling of numbers and inherited script

* Make the 2 conditions clearer in the code
diff --git a/bindings/python/README.md b/bindings/python/README.md
@@ -64,6 +64,7 @@ tokenizer = pyonmttok.Tokenizer(
     soft_case_regions: bool = False,
     no_substitution: bool = False,
     with_separators: bool = False,
+    allow_isolated_marks: bool = False,
     preserve_placeholders: bool = False,
     preserve_segmented_tokens: bool = False,
     segment_case: bool = False,
diff --git a/bindings/python/pyonmttok/Python.cc b/bindings/python/pyonmttok/Python.cc
@@ -47,6 +47,7 @@ class TokenizerWrapper
                    bool soft_case_regions,
                    bool no_substitution,
                    bool with_separators,
+                   bool allow_isolated_marks,
                    bool preserve_placeholders,
                    bool preserve_segmented_tokens,
                    bool segment_case,
@@ -69,6 +70,7 @@ class TokenizerWrapper
     options.lang = lang.value_or("");
     options.no_substitution = no_substitution;
     options.with_separators = with_separators;
+    options.allow_isolated_marks = allow_isolated_marks;
     options.case_feature = case_feature;
     options.case_markup = case_markup;
     options.soft_case_regions = soft_case_regions;
@@ -109,6 +111,7 @@ class TokenizerWrapper
       "lang"_a=options.lang,
       "no_substitution"_a=options.no_substitution,
       "with_separators"_a=options.with_separators,
+      "allow_isolated_marks"_a=options.allow_isolated_marks,
       "case_feature"_a=options.case_feature,
       "case_markup"_a=options.case_markup,
       "soft_case_regions"_a=options.soft_case_regions,
@@ -594,6 +597,7 @@ PYBIND11_MODULE(_ext, m)
          bool,
          bool,
          bool,
+         bool,
          const std::optional<std::vector<std::string>>&>(),
          py::arg("mode"),
          py::kw_only(),
@@ -618,6 +622,7 @@ PYBIND11_MODULE(_ext, m)
          py::arg("soft_case_regions")=false,
          py::arg("no_substitution")=false,
          py::arg("with_separators")=false,
+         py::arg("allow_isolated_marks")=false,
          py::arg("preserve_placeholders")=false,
          py::arg("preserve_segmented_tokens")=false,
          py::arg("segment_case")=false,
diff --git a/cli/tokenization_args.h b/cli/tokenization_args.h
@@ -15,6 +15,8 @@ inline void add_tokenization_options(cxxopts::Options& options)
      cxxopts::value<bool>()->default_value("false"))
     ("with_separators", "Include separator characters in the tokenized output",
      cxxopts::value<bool>()->default_value("false"))
+    ("allow_isolated_marks", "Allow isolated combining marks",
+     cxxopts::value<bool>()->default_value("false"))
     ;
 
   options.add_options("Reversible tokenization")
@@ -65,6 +67,7 @@ inline onmt::Tokenizer::Options build_tokenization_options(const cxxopts::ParseR
   options.lang = args["lang"].as<std::string>();
   options.no_substitution = args["no_substitution"].as<bool>();
   options.with_separators = args["with_separators"].as<bool>();
+  options.allow_isolated_marks = args["allow_isolated_marks"].as<bool>();
   options.case_feature = args["case_feature"].as<bool>();
   options.case_markup = args["case_markup"].as<bool>();
   options.soft_case_regions = args["soft_case_regions"].as<bool>();
diff --git a/docs/options.md b/docs/options.md
@@ -85,6 +85,13 @@ A++ ++B
 
 Note: this option makes the tokenized output reversible so `joiner_annotate` or `spacer_annotate` should not be used.
 
+### `allow_isolated_marks` (boolean, default: `false`)
+
+By default, combining marks are always attached to the previous character. When this option is enabled, a combining mark is detached from the previous character if either of the following conditions is met:
+
+* when the previous character is from the "Separator" Unicode category
+* when `segment_alphabet_change` is enabled and the previous character does not belong to the same alphabet (Unicode script) as the combining mark
+
 ## Case annotation
 
 ### `case_feature` (boolean, default: `false`)
diff --git a/include/onmt/Tokenizer.h b/include/onmt/Tokenizer.h
@@ -40,6 +40,7 @@ namespace onmt
       bool case_markup = false;
       bool soft_case_regions = false;
       bool with_separators = false;
+      bool allow_isolated_marks = false;
       bool joiner_annotate = false;
       bool joiner_new = false;
       std::string joiner;
diff --git a/src/Tokenizer.cc b/src/Tokenizer.cc
@@ -754,12 +754,27 @@ namespace onmt
   }
 
   static inline size_t get_next_main_char(const std::vector<unicode::CharInfo>& chars,
-                                          size_t offset)
+                                          const std::vector<int>& scripts,
+                                          size_t offset,
+                                          const Tokenizer::Options& options)
   {
-    ++offset;
-    while (offset < chars.size() && chars[offset].char_type == unicode::CharType::Mark)
-      ++offset;
-    return offset;
+    size_t next_offset = offset + 1;
+
+    while (next_offset < chars.size()) {
+      if (chars[next_offset].char_type != unicode::CharType::Mark)
+        break;
+
+      if (options.allow_isolated_marks) {
+        if (chars[offset].char_type == unicode::CharType::Separator)
+          break;
+        if (options.segment_alphabet_change && scripts[next_offset] != scripts[offset])
+          break;
+      }
+
+      ++next_offset;
+    }
+
+    return next_offset;
   }
 
   void Tokenizer::tokenize_text(const std::string& text,
@@ -771,6 +786,13 @@ namespace onmt
 
     const auto chars = unicode::get_characters_info(text);
 
+    std::vector<int> scripts;
+    scripts.reserve(chars.size());
+    for (const auto& c : chars) {
+      const int previous_script = scripts.empty() ? -1 : scripts.back();
+      scripts.emplace_back(unicode::get_script(c.value, previous_script));
+    }
+
     TokensBuilder builder(_options, annotated_tokens);
     State state = State::Space;
     int prev_alphabet = -1;
@@ -782,7 +804,7 @@ namespace onmt
       if (v < 32 || v == 0xFEFF)  // skip special characters and BOM
         continue;
 
-      const size_t next_index = get_next_main_char(chars, i);
+      const size_t next_index = get_next_main_char(chars, scripts, i, _options);
       const auto* next_c = next_index < chars.size() ? &chars[next_index] : nullptr;
       const bool has_combining_marks = (next_index != i + 1);
 
@@ -872,7 +894,7 @@ namespace onmt
         if (is_number)
           alphabet = number_alphabet;
         else if (is_letter)
-          alphabet = unicode::get_script(v, prev_alphabet);
+          alphabet = scripts[i];
 
         if (alphabets != nullptr)
         {
@@ -976,6 +998,8 @@ namespace onmt
           {
             builder.segment();
             builder.current().join_left = true;
+            if (_options.preserve_segmented_tokens && c.char_type == unicode::CharType::Mark)
+              builder.current().preserve = true;
           }
 
           builder.safe_append(c);
diff --git a/test/test.cc b/test/test.cc
@@ -477,6 +477,13 @@ TEST(TokenizerTest, CombiningMarkOnSpace) {
     test_tok_and_detok(options, "b ̇c", "b ￭％0020̇￭ c");
   }
 
+  {
+    Tokenizer::Options options;
+    options.joiner_annotate = true;
+    options.allow_isolated_marks = true;
+    test_tok_and_detok(options, "b ̇c", "b ̇￭ c");
+  }
+
   {
     Tokenizer::Options options;
     options.spacer_annotate = true;
@@ -753,6 +760,19 @@ TEST(TokenizerTest, SegmentAlphabetChangeCommonScript) {
   test_tok(options, "「キャント・バイ・ミー・ラヴ」", "「 キャント ・ バイ ・ ミー ・ ラヴ 」");
 }
 
+TEST(TokenizerTest, SegmentAlphabetChangeIsolatedMarks) {
+  Tokenizer::Options options;
+  options.segment_alphabet_change = true;
+  options.allow_isolated_marks = true;
+  options.joiner_annotate = true;
+  test_tok(options, "abc়", "abc ￭়");
+  test_tok(options, "8ে", "8 ￭ে");
+  test_tok(options, "ё", "ё");  // combining mark with inherited script.
+
+  options.preserve_segmented_tokens = true;
+  test_tok(options, "abc়", "abc ￭ ়");
+}
+
 TEST(TokenizerTest, PreserveSegmentedNumbers) {
   Tokenizer::Options options;
   options.mode = Tokenizer::Mode::Aggressive;