From 5bb49ec000213520fcf5078151915eb0e4be1396 Mon Sep 17 00:00:00 2001 From: Weng Xuetian Date: Fri, 6 Dec 2024 13:37:25 -0800 Subject: [PATCH] Improve pinyin fuzzy segment algorithm Previously, we blindly chose the segment to always prefer the longer next match; this is proven wrong in the case of "sangeren", which should produce "san ge ren", "sang er en", "sang e ren". Instead, we change the check to be: if (current + next match) is valid, and complete pinyin, make it an acceptable option, unless (current, next match) is actually an inner fuzzy, which is handled separately below. For example: 1. For sangeren, will produce sang & san, since next match of "san", which is "ge", is a complete pinyin. 2. For hua, will only produce hua, since hu a is an inner fuzzy. Even if it will produce an "extra" segment, for example, in the case of "sanger" it will produce a partial pinyin "san" "ge" "r", we may still consider it sensible, since partial pinyin match is considered fuzzy and will have a penalty score. People may even benefit from such a segment, since "san ge r" seems to be the most likely option. Fix #87 --- src/libime/pinyin/pinyinencoder.cpp | 14 +++++++------- test/testpinyinencoder.cpp | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/libime/pinyin/pinyinencoder.cpp b/src/libime/pinyin/pinyinencoder.cpp index 671b47b..c0595ba 100644 --- a/src/libime/pinyin/pinyinencoder.cpp +++ b/src/libime/pinyin/pinyinencoder.cpp @@ -233,20 +233,20 @@ PinyinEncoder::parseUserPinyin(std::string userPinyin, fuzzyFlags, pinyinMap); auto nextMatchAlt = longestMatch(iter + str.size() - 1, end, fuzzyFlags, pinyinMap); - auto matchSize = str.size() + nextMatch.match.size(); auto matchSizeAlt = str.size() - 1 + nextMatchAlt.match.size(); - // comparator is (validPinyin, wholeMatchSize, + // comparator is (validPinyin, whole size>= lhs pinyin, // isCompletePinyin) validPinyin means it's at least some // pinyin, instead of things startsWith i,u,v. 
Since // longestMatch will now treat string startsWith iuv a whole // segment, we need to compare validity before the length. - // Always prefer longer match and complete pinyin match. - std::tuple compare( - nextMatch.valid, matchSize, nextMatch.isCompletePinyin); - std::tuple compareAlt( - nextMatchAlt.valid, matchSizeAlt, + // If whole size is equal to lhs pinyin, then it should be + // handled by inner segement flag. + std::tuple compare( + nextMatch.valid, true, nextMatch.isCompletePinyin); + std::tuple compareAlt( + nextMatchAlt.valid, matchSizeAlt > str.size(), nextMatchAlt.isCompletePinyin); if (compare >= compareAlt) { diff --git a/test/testpinyinencoder.cpp b/test/testpinyinencoder.cpp index 8033167..0c30638 100644 --- a/test/testpinyinencoder.cpp +++ b/test/testpinyinencoder.cpp @@ -233,6 +233,8 @@ int main() { check("zhuna", PinyinFuzzyFlag::Inner, {"zhu", "na"}); check("zhuna", PinyinFuzzyFlag::Inner, {"zhun", "a"}); + check("sangeren", PinyinFuzzyFlag::Inner, {"san", "ge", "ren"}); + { PinyinCorrectionProfile profile(BuiltinPinyinCorrectionProfile::Qwerty); auto graph = PinyinEncoder::parseUserPinyin(