Skip to content

Commit d5c1908

Browse files
authored Feb 28, 2023
Add tokenization option allow_isolated_marks (#316)
* Add tokenization option allow_isolated_marks
* Fix missing handling of numbers and inherited script
* Make the 2 conditions clearer in the code
1 parent 58e34b0 commit d5c1908

File tree

7 files changed

+68
-7
lines changed

7 files changed

+68
-7
lines changed
 

‎bindings/python/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ tokenizer = pyonmttok.Tokenizer(
6464
soft_case_regions: bool = False,
6565
no_substitution: bool = False,
6666
with_separators: bool = False,
67+
allow_isolated_marks: bool = False,
6768
preserve_placeholders: bool = False,
6869
preserve_segmented_tokens: bool = False,
6970
segment_case: bool = False,

‎bindings/python/pyonmttok/Python.cc

+5
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class TokenizerWrapper
4747
bool soft_case_regions,
4848
bool no_substitution,
4949
bool with_separators,
50+
bool allow_isolated_marks,
5051
bool preserve_placeholders,
5152
bool preserve_segmented_tokens,
5253
bool segment_case,
@@ -69,6 +70,7 @@ class TokenizerWrapper
6970
options.lang = lang.value_or("");
7071
options.no_substitution = no_substitution;
7172
options.with_separators = with_separators;
73+
options.allow_isolated_marks = allow_isolated_marks;
7274
options.case_feature = case_feature;
7375
options.case_markup = case_markup;
7476
options.soft_case_regions = soft_case_regions;
@@ -109,6 +111,7 @@ class TokenizerWrapper
109111
"lang"_a=options.lang,
110112
"no_substitution"_a=options.no_substitution,
111113
"with_separators"_a=options.with_separators,
114+
"allow_isolated_marks"_a=options.allow_isolated_marks,
112115
"case_feature"_a=options.case_feature,
113116
"case_markup"_a=options.case_markup,
114117
"soft_case_regions"_a=options.soft_case_regions,
@@ -594,6 +597,7 @@ PYBIND11_MODULE(_ext, m)
594597
bool,
595598
bool,
596599
bool,
600+
bool,
597601
const std::optional<std::vector<std::string>>&>(),
598602
py::arg("mode"),
599603
py::kw_only(),
@@ -618,6 +622,7 @@ PYBIND11_MODULE(_ext, m)
618622
py::arg("soft_case_regions")=false,
619623
py::arg("no_substitution")=false,
620624
py::arg("with_separators")=false,
625+
py::arg("allow_isolated_marks")=false,
621626
py::arg("preserve_placeholders")=false,
622627
py::arg("preserve_segmented_tokens")=false,
623628
py::arg("segment_case")=false,

‎cli/tokenization_args.h

+3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ inline void add_tokenization_options(cxxopts::Options& options)
1515
cxxopts::value<bool>()->default_value("false"))
1616
("with_separators", "Include separator characters in the tokenized output",
1717
cxxopts::value<bool>()->default_value("false"))
18+
("allow_isolated_marks", "Allow isolated combining marks",
19+
cxxopts::value<bool>()->default_value("false"))
1820
;
1921

2022
options.add_options("Reversible tokenization")
@@ -65,6 +67,7 @@ inline onmt::Tokenizer::Options build_tokenization_options(const cxxopts::ParseR
6567
options.lang = args["lang"].as<std::string>();
6668
options.no_substitution = args["no_substitution"].as<bool>();
6769
options.with_separators = args["with_separators"].as<bool>();
70+
options.allow_isolated_marks = args["allow_isolated_marks"].as<bool>();
6871
options.case_feature = args["case_feature"].as<bool>();
6972
options.case_markup = args["case_markup"].as<bool>();
7073
options.soft_case_regions = args["soft_case_regions"].as<bool>();

‎docs/options.md

+7
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ A++ ++B
8585

8686
Note: this option makes the tokenized output reversible so `joiner_annotate` or `spacer_annotate` should not be used.
8787

88+
### `allow_isolated_marks` (boolean, default: `false`)
89+
90+
By default, combining marks are always attached to the previous character. When this option is enabled, a combining mark can be detached from the previous character under either of the following conditions:
91+
92+
* when the previous character is from the "Separator" Unicode category
93+
* when `segment_alphabet_change` is enabled and the previous character does not belong to the same alphabet
94+
8895
## Case annotation
8996

9097
### `case_feature` (boolean, default: `false`)

‎include/onmt/Tokenizer.h

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ namespace onmt
4040
bool case_markup = false;
4141
bool soft_case_regions = false;
4242
bool with_separators = false;
43+
bool allow_isolated_marks = false;
4344
bool joiner_annotate = false;
4445
bool joiner_new = false;
4546
std::string joiner;

‎src/Tokenizer.cc

+31-7
Original file line numberDiff line numberDiff line change
@@ -754,12 +754,27 @@ namespace onmt
754754
}
755755

756756
static inline size_t get_next_main_char(const std::vector<unicode::CharInfo>& chars,
757-
size_t offset)
757+
const std::vector<int>& scripts,
758+
size_t offset,
759+
const Tokenizer::Options& options)
758760
{
759-
++offset;
760-
while (offset < chars.size() && chars[offset].char_type == unicode::CharType::Mark)
761-
++offset;
762-
return offset;
761+
size_t next_offset = offset + 1;
762+
763+
while (next_offset < chars.size()) {
764+
if (chars[next_offset].char_type != unicode::CharType::Mark)
765+
break;
766+
767+
if (options.allow_isolated_marks) {
768+
if (chars[offset].char_type == unicode::CharType::Separator)
769+
break;
770+
if (options.segment_alphabet_change && scripts[next_offset] != scripts[offset])
771+
break;
772+
}
773+
774+
++next_offset;
775+
}
776+
777+
return next_offset;
763778
}
764779

765780
void Tokenizer::tokenize_text(const std::string& text,
@@ -771,6 +786,13 @@ namespace onmt
771786

772787
const auto chars = unicode::get_characters_info(text);
773788

789+
std::vector<int> scripts;
790+
scripts.reserve(chars.size());
791+
for (const auto& c : chars) {
792+
const int previous_script = scripts.empty() ? -1 : scripts.back();
793+
scripts.emplace_back(unicode::get_script(c.value, previous_script));
794+
}
795+
774796
TokensBuilder builder(_options, annotated_tokens);
775797
State state = State::Space;
776798
int prev_alphabet = -1;
@@ -782,7 +804,7 @@ namespace onmt
782804
if (v < 32 || v == 0xFEFF) // skip special characters and BOM
783805
continue;
784806

785-
const size_t next_index = get_next_main_char(chars, i);
807+
const size_t next_index = get_next_main_char(chars, scripts, i, _options);
786808
const auto* next_c = next_index < chars.size() ? &chars[next_index] : nullptr;
787809
const bool has_combining_marks = (next_index != i + 1);
788810

@@ -872,7 +894,7 @@ namespace onmt
872894
if (is_number)
873895
alphabet = number_alphabet;
874896
else if (is_letter)
875-
alphabet = unicode::get_script(v, prev_alphabet);
897+
alphabet = scripts[i];
876898

877899
if (alphabets != nullptr)
878900
{
@@ -976,6 +998,8 @@ namespace onmt
976998
{
977999
builder.segment();
9781000
builder.current().join_left = true;
1001+
if (_options.preserve_segmented_tokens && c.char_type == unicode::CharType::Mark)
1002+
builder.current().preserve = true;
9791003
}
9801004

9811005
builder.safe_append(c);

‎test/test.cc

+20
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,13 @@ TEST(TokenizerTest, CombiningMarkOnSpace) {
477477
test_tok_and_detok(options, "b ̇c", "b ■%0020̇■ c");
478478
}
479479

480+
{
481+
Tokenizer::Options options;
482+
options.joiner_annotate = true;
483+
options.allow_isolated_marks = true;
484+
test_tok_and_detok(options, "b ̇c", "b ̇■ c");
485+
}
486+
480487
{
481488
Tokenizer::Options options;
482489
options.spacer_annotate = true;
@@ -753,6 +760,19 @@ TEST(TokenizerTest, SegmentAlphabetChangeCommonScript) {
753760
test_tok(options, "「キャント・バイ・ミー・ラヴ」", "「 キャント ・ バイ ・ ミー ・ ラヴ 」");
754761
}
755762

763+
TEST(TokenizerTest, SegmentAlphabetChangeIsolatedMarks) {
764+
Tokenizer::Options options;
765+
options.segment_alphabet_change = true;
766+
options.allow_isolated_marks = true;
767+
options.joiner_annotate = true;
768+
test_tok(options, "abc়", "abc ■়");
769+
test_tok(options, "8ে", "8 ■ে");
770+
test_tok(options, "ё", "ё"); // combining mark with inherited script.
771+
772+
options.preserve_segmented_tokens = true;
773+
test_tok(options, "abc়", "abc ■ ়");
774+
}
775+
756776
TEST(TokenizerTest, PreserveSegmentedNumbers) {
757777
Tokenizer::Options options;
758778
options.mode = Tokenizer::Mode::Aggressive;

0 commit comments

Comments
 (0)
Please sign in to comment.